LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as when using CachingTokenStream

SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents, regardless of hl.maxDocCharsToAnalyze

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1095260 13f79535-47bb-0310-9956-ffa450edef68
Author: Mark Robert Miller
Date: 2011-04-20 03:43:27 +00:00
parent ccf84e155e
commit ec366d2c52
8 changed files with 39 additions and 9 deletions
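
For context, a minimal hedged sketch of the user-facing call path this commit fixes (the field name "content", the analyzer, and the 50K cap are illustrative placeholders, not part of the patch). Before this change, Highlighter.setMaxDocCharsToAnalyze only bounded fragment iteration; QueryScorer's WeightedSpanTermExtractor still cached and re-analyzed the entire token stream, which is what made very large documents slow.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightSketch {
  // Hypothetical helper; "content" and the 50K cap are placeholder values.
  static String highlightLargeDoc(Query query, Analyzer analyzer, String text)
      throws Exception {
    QueryScorer scorer = new QueryScorer(query, "content");
    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
    // With this commit, the cap below is forwarded to QueryScorer and on into
    // WeightedSpanTermExtractor, so both the CachingTokenFilter and the
    // MemoryIndex field stop consuming tokens once the limit is reached.
    highlighter.setMaxDocCharsToAnalyze(50 * 1024);
    return highlighter.getBestFragment(analyzer, "content", text);
  }
}

After this commit the same setter also caps the work done inside WeightedSpanTermExtractor, as the diffs below show.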

lucene/CHANGES.txt

@@ -45,10 +45,15 @@ API Changes

 ======================= Lucene 3.x (not yet released) =======================

-Bug fixes
+Bug Fixes

 * LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
   on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)

+* LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in
+  WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as
+  when using CachingTokenStream. This can be a significant performance bug for
+  large documents. (Mark Miller)
+
 New Features

org/apache/lucene/search/highlight/Highlighter.java

@@ -197,6 +197,11 @@ public class Highlighter
       tokenStream.reset();

       TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
+
+      if (fragmentScorer instanceof QueryScorer) {
+        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
+      }
+
       TokenStream newStream = fragmentScorer.init(tokenStream);
       if(newStream != null) {
         tokenStream = newStream;

org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java (new file)

@@ -0,0 +1 @@
+package org.apache.lucene.search.highlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * This TokenFilter limits the number of tokens while indexing by adding up the
+ * current offset.
+ */
+public final class OffsetLimitTokenFilter extends TokenFilter {
+
+  private int offsetCount;
+  private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);
+  private int offsetLimit;
+
+  public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
+    super(input);
+    this.offsetLimit = offsetLimit;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (offsetCount < offsetLimit && input.incrementToken()) {
+      int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset();
+      offsetCount += offsetLength;
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    offsetCount = 0;
+  }
+}
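
A usage sketch of the new filter (not part of the patch; the tokenizer, sample text, limit of 7, and Version constant are illustrative assumptions). Note the subtlety: the limit is checked before a token's offsets are added to the running count, so the token that crosses the limit is still emitted.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;
import org.apache.lucene.util.Version;

public class OffsetLimitSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new OffsetLimitTokenFilter(
        new WhitespaceTokenizer(Version.LUCENE_40,
            new StringReader("one two three four")), 7);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Prints "one", "two", "three": the counts 3 and 6 are still under the
      // limit of 7, so "three" is consumed too; afterwards the count is 11
      // and "four" is never returned.
      System.out.println(term.toString());
    }
  }
}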

org/apache/lucene/search/highlight/QueryScorer.java

@@ -54,6 +54,7 @@ public class QueryScorer implements Scorer {
   private IndexReader reader;
   private boolean skipInitExtractor;
   private boolean wrapToCaching = true;
+  private int maxCharsToAnalyze;

   /**
    * @param query Query to use for highlighting
@@ -209,7 +210,7 @@ public class QueryScorer implements Scorer {
   private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
     WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
         : new WeightedSpanTermExtractor(defaultField);
+    qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
     qse.setExpandMultiTermQuery(expandMultiTermQuery);
     qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
     if (reader == null) {
@@ -265,4 +266,8 @@ public class QueryScorer implements Scorer {
   public void setWrapIfNotCachingTokenFilter(boolean wrap) {
     this.wrapToCaching = wrap;
   }
+
+  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxCharsToAnalyze = maxDocCharsToAnalyze;
+  }
 }

org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java

@@ -56,6 +56,7 @@ public class WeightedSpanTermExtractor {
   private boolean expandMultiTermQuery;
   private boolean cachedTokenStream;
   private boolean wrapToCaching = true;
+  private int maxDocCharsToAnalyze;

   public WeightedSpanTermExtractor() {
   }
@@ -320,13 +321,13 @@ public class WeightedSpanTermExtractor {

   private AtomicReaderContext getLeafContextForField(String field) throws IOException {
     if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
-      tokenStream = new CachingTokenFilter(tokenStream);
+      tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       cachedTokenStream = true;
     }
     AtomicReaderContext context = readers.get(field);
     if (context == null) {
       MemoryIndex indexer = new MemoryIndex();
-      indexer.addField(field, tokenStream);
+      indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       tokenStream.reset();
       IndexSearcher searcher = indexer.createSearcher();
       // MEM index has only atomic ctx
@@ -545,4 +546,8 @@ public class WeightedSpanTermExtractor {
   public void setWrapIfNotCachingTokenFilter(boolean wrap) {
     this.wrapToCaching = wrap;
   }
+
+  protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
+  }
 }

org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java (new file)

@@ -0,0 +1 @@
+package org.apache.lucene.search.highlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {
+
+  public void testFilter() throws Exception {
+    TokenStream stream = new MockTokenizer(new StringReader(
+        "short toolong evenmuchlongertext a ab toolong foo"),
+        MockTokenizer.WHITESPACE, false);
+    OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10);
+    assertTokenStreamContents(filter, new String[] {"short", "toolong"});
+
+    stream = new MockTokenizer(new StringReader(
+        "short toolong evenmuchlongertext a ab toolong foo"),
+        MockTokenizer.WHITESPACE, false);
+    filter = new OffsetLimitTokenFilter(stream, 12);
+    assertTokenStreamContents(filter, new String[] {"short", "toolong"});
+
+    stream = new MockTokenizer(new StringReader(
+        "short toolong evenmuchlongertext a ab toolong foo"),
+        MockTokenizer.WHITESPACE, false);
+    filter = new OffsetLimitTokenFilter(stream, 30);
+    assertTokenStreamContents(filter, new String[] {"short", "toolong",
+        "evenmuchlongertext"});
+
+    checkOneTermReuse(new Analyzer() {
+
+      @Override
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new OffsetLimitTokenFilter(new MockTokenizer(reader,
+            MockTokenizer.WHITESPACE, false), 10);
+      }
+    }, "llenges", "llenges");
+  }
+}

solr/CHANGES.txt

@@ -278,6 +278,9 @@ Bug Fixes
 * SOLR-2333: The "rename" core admin action does not persist the new name to solr.xml
   (Rasmus Hahn, Paul R. Brown via Mark Miller)

+* SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents,
+  regardless of hl.maxDocCharsToAnalyze. (Mark Miller)
+
 Other Changes
 ----------------------
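
In Solr terms, this means hl.maxDocCharsToAnalyze now genuinely bounds the phrase highlighter's work per field value. A hedged client-side sketch using SolrJ (the query string, the parameter toggles, and the 50,000-char cap are illustrative, not part of the patch):

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.common.params.HighlightParams;

public class HighlightParamsSketch {
  public static SolrQuery buildQuery() {
    SolrQuery q = new SolrQuery("text:lucene");
    q.setHighlight(true);
    // hl.usePhraseHighlighter (defaults to true in DefaultSolrHighlighter)
    q.set(HighlightParams.USE_PHRASE_HIGHLIGHTER, true);
    // hl.maxDocCharsToAnalyze: with this fix, analysis of each large document
    // stops after roughly 50,000 characters instead of consuming the whole text.
    q.set(HighlightParams.MAX_CHARS, 50000);
    return q;
  }
}

Passing a negative value preserves the old analyze-everything behavior, as the DefaultSolrHighlighter change below shows.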

org/apache/solr/highlight/DefaultSolrHighlighter.java

@@ -435,12 +435,20 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
         // fall back to analyzer
         tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
       }

+      int maxCharsToAnalyze = params.getFieldInt(fieldName,
+          HighlightParams.MAX_CHARS,
+          Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
+
       Highlighter highlighter;
       if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
         // TODO: this is not always necessary - eventually we would like to avoid this wrap
         //       when it is not needed.
-        tstream = new CachingTokenFilter(tstream);
+        if (maxCharsToAnalyze < 0) {
+          tstream = new CachingTokenFilter(tstream);
+        } else {
+          tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
+        }

         // get highlighter
         highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
@@ -453,9 +461,6 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
         highlighter = getHighlighter(query, fieldName, req);
       }

-      int maxCharsToAnalyze = params.getFieldInt(fieldName,
-          HighlightParams.MAX_CHARS,
-          Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
       if (maxCharsToAnalyze < 0) {
         highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
       } else {