LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as when using CachingTokenStream

SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents, regardless of hl.maxDocCharsToAnalyze

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1095260 13f79535-47bb-0310-9956-ffa450edef68
Author: Mark Robert Miller
Date: 2011-04-20 03:43:27 +00:00
parent ccf84e155e
commit ec366d2c52
8 changed files with 39 additions and 9 deletions
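
For context, a minimal hedged sketch of the user-facing call path this commit fixes (the field name "content", the analyzer, and the 50K cap are illustrative placeholders, not part of the patch). Before this change, Highlighter.setMaxDocCharsToAnalyze only bounded fragment iteration; QueryScorer's WeightedSpanTermExtractor still cached and re-analyzed the entire token stream, which is what made very large documents slow.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightSketch {
  // Hypothetical helper; "content" and the 50K cap are placeholder values.
  static String highlightLargeDoc(Query query, Analyzer analyzer, String text)
      throws Exception {
    QueryScorer scorer = new QueryScorer(query, "content");
    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
    // With this commit, the cap below is forwarded to QueryScorer and on into
    // WeightedSpanTermExtractor, so both the CachingTokenFilter and the
    // MemoryIndex field stop consuming tokens once the limit is reached.
    highlighter.setMaxDocCharsToAnalyze(50 * 1024);
    return highlighter.getBestFragment(analyzer, "content", text);
  }
}

After this commit the same setter also caps the work done inside WeightedSpanTermExtractor, as the diffs below show.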

lucene/CHANGES.txt

@@ -45,10 +45,15 @@ API Changes

 ======================= Lucene 3.x (not yet released) =======================

-Bug fixes
+Bug Fixes

 * LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
   on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)

+* LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in
+  WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as
+  when using CachingTokenStream. This can be a significant performance bug for
+  large documents. (Mark Miller)
+
 New Features

org/apache/lucene/search/highlight/Highlighter.java

@@ -197,6 +197,11 @@ public class Highlighter
       tokenStream.reset();

       TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
+
+      if (fragmentScorer instanceof QueryScorer) {
+        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
+      }
+
       TokenStream newStream = fragmentScorer.init(tokenStream);
       if(newStream != null) {
         tokenStream = newStream;

org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java (new file)

@@ -0,0 +1 @@
+package org.apache.lucene.search.highlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * This TokenFilter limits the number of tokens while indexing by adding up the
+ * current offset.
+ */
+public final class OffsetLimitTokenFilter extends TokenFilter {
+
+  private int offsetCount;
+  private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);
+  private int offsetLimit;
+
+  public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
+    super(input);
+    this.offsetLimit = offsetLimit;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (offsetCount < offsetLimit && input.incrementToken()) {
+      int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset();
+      offsetCount += offsetLength;
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    offsetCount = 0;
+  }
+}
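
A usage sketch of the new filter (not part of the patch; the tokenizer, sample text, limit of 7, and Version constant are illustrative assumptions). Note the subtlety: the limit is checked before a token's offsets are added to the running count, so the token that crosses the limit is still emitted.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;
import org.apache.lucene.util.Version;

public class OffsetLimitSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new OffsetLimitTokenFilter(
        new WhitespaceTokenizer(Version.LUCENE_40,
            new StringReader("one two three four")), 7);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Prints "one", "two", "three": the counts 3 and 6 are still under the
      // limit of 7, so "three" is consumed too; afterwards the count is 11
      // and "four" is never returned.
      System.out.println(term.toString());
    }
  }
}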

org/apache/lucene/search/highlight/QueryScorer.java

@@ -54,6 +54,7 @@ public class QueryScorer implements Scorer {
   private IndexReader reader;
   private boolean skipInitExtractor;
   private boolean wrapToCaching = true;
+  private int maxCharsToAnalyze;

   /**
    * @param query Query to use for highlighting
@@ -209,7 +210,7 @@ public class QueryScorer implements Scorer {
   private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
     WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
         : new WeightedSpanTermExtractor(defaultField);
+    qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
     qse.setExpandMultiTermQuery(expandMultiTermQuery);
     qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
     if (reader == null) {
@@ -265,4 +266,8 @@ public class QueryScorer implements Scorer {
   public void setWrapIfNotCachingTokenFilter(boolean wrap) {
     this.wrapToCaching = wrap;
   }
+
+  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxCharsToAnalyze = maxDocCharsToAnalyze;
+  }
 }

org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java

@@ -56,6 +56,7 @@ public class WeightedSpanTermExtractor {
   private boolean expandMultiTermQuery;
   private boolean cachedTokenStream;
   private boolean wrapToCaching = true;
+  private int maxDocCharsToAnalyze;

   public WeightedSpanTermExtractor() {
   }
@@ -320,13 +321,13 @@ public class WeightedSpanTermExtractor {

   private AtomicReaderContext getLeafContextForField(String field) throws IOException {
     if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
-      tokenStream = new CachingTokenFilter(tokenStream);
+      tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       cachedTokenStream = true;
     }
     AtomicReaderContext context = readers.get(field);
     if (context == null) {
       MemoryIndex indexer = new MemoryIndex();
-      indexer.addField(field, tokenStream);
+      indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       tokenStream.reset();
       IndexSearcher searcher = indexer.createSearcher();
       // MEM index has only atomic ctx
@@ -545,4 +546,8 @@ public class WeightedSpanTermExtractor {
   public void setWrapIfNotCachingTokenFilter(boolean wrap) {
     this.wrapToCaching = wrap;
   }
+
+  protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
+  }
 }

org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java (new file)

@@ -0,0 +1 @@
+package org.apache.lucene.search.highlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {
+
+  public void testFilter() throws Exception {
+    TokenStream stream = new MockTokenizer(new StringReader(
+        "short toolong evenmuchlongertext a ab toolong foo"),
+        MockTokenizer.WHITESPACE, false);
+    OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10);
+    assertTokenStreamContents(filter, new String[] {"short", "toolong"});
+
+    stream = new MockTokenizer(new StringReader(
+        "short toolong evenmuchlongertext a ab toolong foo"),
+        MockTokenizer.WHITESPACE, false);
+    filter = new OffsetLimitTokenFilter(stream, 12);
+    assertTokenStreamContents(filter, new String[] {"short", "toolong"});
+
+    stream = new MockTokenizer(new StringReader(
+        "short toolong evenmuchlongertext a ab toolong foo"),
+        MockTokenizer.WHITESPACE, false);
+    filter = new OffsetLimitTokenFilter(stream, 30);
+    assertTokenStreamContents(filter, new String[] {"short", "toolong",
+        "evenmuchlongertext"});
+
+    checkOneTermReuse(new Analyzer() {
+
+      @Override
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new OffsetLimitTokenFilter(new MockTokenizer(reader,
+            MockTokenizer.WHITESPACE, false), 10);
+      }
+    }, "llenges", "llenges");
+  }
+}

solr/CHANGES.txt

@@ -278,6 +278,9 @@ Bug Fixes
 * SOLR-2333: The "rename" core admin action does not persist the new name to solr.xml
   (Rasmus Hahn, Paul R. Brown via Mark Miller)

+* SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents,
+  regardless of hl.maxDocCharsToAnalyze. (Mark Miller)
+
 Other Changes
 ----------------------
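
In Solr terms, this means hl.maxDocCharsToAnalyze now genuinely bounds the phrase highlighter's work per field value. A hedged client-side sketch using SolrJ (the query string, the parameter toggles, and the 50,000-char cap are illustrative, not part of the patch):

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.common.params.HighlightParams;

public class HighlightParamsSketch {
  public static SolrQuery buildQuery() {
    SolrQuery q = new SolrQuery("text:lucene");
    q.setHighlight(true);
    // hl.usePhraseHighlighter (defaults to true in DefaultSolrHighlighter)
    q.set(HighlightParams.USE_PHRASE_HIGHLIGHTER, true);
    // hl.maxDocCharsToAnalyze: with this fix, analysis of each large document
    // stops after roughly 50,000 characters instead of consuming the whole text.
    q.set(HighlightParams.MAX_CHARS, 50000);
    return q;
  }
}

Passing a negative value preserves the old analyze-everything behavior, as the DefaultSolrHighlighter change below shows.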

org/apache/solr/highlight/DefaultSolrHighlighter.java

@@ -435,12 +435,20 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
         // fall back to analyzer
         tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
       }

+      int maxCharsToAnalyze = params.getFieldInt(fieldName,
+          HighlightParams.MAX_CHARS,
+          Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
+
       Highlighter highlighter;
       if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
         // TODO: this is not always necessary - eventually we would like to avoid this wrap
         //       when it is not needed.
-        tstream = new CachingTokenFilter(tstream);
+        if (maxCharsToAnalyze < 0) {
+          tstream = new CachingTokenFilter(tstream);
+        } else {
+          tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
+        }

         // get highlighter
         highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
@@ -453,9 +461,6 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
         highlighter = getHighlighter(query, fieldName, req);
       }

-      int maxCharsToAnalyze = params.getFieldInt(fieldName,
-          HighlightParams.MAX_CHARS,
-          Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
       if (maxCharsToAnalyze < 0) {
         highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
       } else {