mirror of https://github.com/apache/lucene.git
LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as when using CachingTokenStream
SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents, regardless of hl.maxDocCharsToAnalyze

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1095260 13f79535-47bb-0310-9956-ffa450edef68
commit ec366d2c52
parent ccf84e155e
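The change wires the highlighter's maxDocCharsToAnalyze limit through QueryScorer and WeightedSpanTermExtractor (see the diffs below). For orientation, here is a minimal caller-side sketch against the Lucene 3.x-era highlighter API; the analyzer, query, field name, and helper class are placeholder assumptions, not part of this commit.

```java
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class MaxCharsHighlightSketch {

  // Highlight one (possibly huge) field value, analyzing at most maxChars characters.
  static String highlight(Query query, Analyzer analyzer, String fieldName,
                          String fieldText, int maxChars) throws Exception {
    QueryScorer scorer = new QueryScorer(query, fieldName);
    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
    // With this commit the limit also reaches the phrase-highlighting path
    // (QueryScorer / WeightedSpanTermExtractor), which previously consumed the
    // whole token stream regardless of the limit.
    highlighter.setMaxDocCharsToAnalyze(maxChars);
    TokenStream tokens = analyzer.tokenStream(fieldName, new StringReader(fieldText));
    return highlighter.getBestFragment(tokens, fieldText);
  }
}
```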
@@ -45,10 +45,15 @@ API Changes
 
 ======================= Lucene 3.x (not yet released) =======================
 
-Bug fixes
+Bug Fixes
 
 * LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
   on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)
 
+* LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in
+  WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as
+  when using CachingTokenStream. This can be a significant performance bug for
+  large documents. (Mark Miller)
+
 New Features
 
@@ -197,6 +197,11 @@ public class Highlighter
       tokenStream.reset();
 
       TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
+
+      if (fragmentScorer instanceof QueryScorer) {
+        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
+      }
+
       TokenStream newStream = fragmentScorer.init(tokenStream);
       if(newStream != null) {
         tokenStream = newStream;

@@ -0,0 +1 @@
package org.apache.lucene.search.highlight;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * This TokenFilter limits the number of tokens while indexing by adding up the
 * current offset.
 */
public final class OffsetLimitTokenFilter extends TokenFilter {

  private int offsetCount;
  private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);
  private int offsetLimit;

  public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
    super(input);
    this.offsetLimit = offsetLimit;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (offsetCount < offsetLimit && input.incrementToken()) {
      int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset();
      offsetCount += offsetLength;
      return true;
    }
    return false;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    offsetCount = 0;
  }
}
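The new filter is meant to be chained in front of whatever consumes the whole stream (CachingTokenFilter, MemoryIndex), as the WeightedSpanTermExtractor change below does. A minimal standalone sketch, assuming the 3.x-era WhitespaceTokenizer and the contrib MemoryIndex; the field name, text, and helper class are placeholders:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;

public class LimitedMemoryIndexSketch {

  // Index only roughly the first maxChars characters of a large field value:
  // the filter stops returning tokens once the accumulated offset lengths pass maxChars.
  static IndexSearcher indexPrefix(String fieldName, String hugeText, int maxChars) {
    TokenStream limited = new OffsetLimitTokenFilter(
        new WhitespaceTokenizer(new StringReader(hugeText)), maxChars);
    MemoryIndex index = new MemoryIndex();
    index.addField(fieldName, limited);
    return index.createSearcher();
  }
}
```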
@@ -54,6 +54,7 @@ public class QueryScorer implements Scorer {
   private IndexReader reader;
   private boolean skipInitExtractor;
   private boolean wrapToCaching = true;
+  private int maxCharsToAnalyze;
 
   /**
    * @param query Query to use for highlighting
@@ -209,7 +210,7 @@ public class QueryScorer implements Scorer {
   private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
     WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
         : new WeightedSpanTermExtractor(defaultField);
-
+    qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
     qse.setExpandMultiTermQuery(expandMultiTermQuery);
     qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
     if (reader == null) {
@@ -265,4 +266,8 @@ public class QueryScorer implements Scorer {
   public void setWrapIfNotCachingTokenFilter(boolean wrap) {
     this.wrapToCaching = wrap;
   }
+
+  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxCharsToAnalyze = maxDocCharsToAnalyze;
+  }
 }

@@ -56,6 +56,7 @@ public class WeightedSpanTermExtractor {
   private boolean expandMultiTermQuery;
   private boolean cachedTokenStream;
   private boolean wrapToCaching = true;
+  private int maxDocCharsToAnalyze;
 
   public WeightedSpanTermExtractor() {
   }
@@ -320,13 +321,13 @@ public class WeightedSpanTermExtractor {
 
   private AtomicReaderContext getLeafContextForField(String field) throws IOException {
     if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
-      tokenStream = new CachingTokenFilter(tokenStream);
+      tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       cachedTokenStream = true;
     }
     AtomicReaderContext context = readers.get(field);
     if (context == null) {
       MemoryIndex indexer = new MemoryIndex();
-      indexer.addField(field, tokenStream);
+      indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       tokenStream.reset();
       IndexSearcher searcher = indexer.createSearcher();
       // MEM index has only atomic ctx
@@ -545,4 +546,8 @@ public class WeightedSpanTermExtractor {
   public void setWrapIfNotCachingTokenFilter(boolean wrap) {
     this.wrapToCaching = wrap;
   }
+
+  protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
+  }
 }

@@ -0,0 +1 @@
package org.apache.lucene.search.highlight;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;

public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {

  public void testFilter() throws Exception {
    TokenStream stream = new MockTokenizer(new StringReader(
        "short toolong evenmuchlongertext a ab toolong foo"),
        MockTokenizer.WHITESPACE, false);
    OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10);
    assertTokenStreamContents(filter, new String[] {"short", "toolong"});

    stream = new MockTokenizer(new StringReader(
        "short toolong evenmuchlongertext a ab toolong foo"),
        MockTokenizer.WHITESPACE, false);
    filter = new OffsetLimitTokenFilter(stream, 12);
    assertTokenStreamContents(filter, new String[] {"short", "toolong"});

    stream = new MockTokenizer(new StringReader(
        "short toolong evenmuchlongertext a ab toolong foo"),
        MockTokenizer.WHITESPACE, false);
    filter = new OffsetLimitTokenFilter(stream, 30);
    assertTokenStreamContents(filter, new String[] {"short", "toolong",
        "evenmuchlongertext"});

    checkOneTermReuse(new Analyzer() {

      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new OffsetLimitTokenFilter(new MockTokenizer(reader,
            MockTokenizer.WHITESPACE, false), 10);
      }
    }, "llenges", "llenges");
  }
}
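A note on why a limit of 10 still admits "toolong": incrementToken checks the accumulated offset count before consuming the next token, so a token that crosses the limit is still emitted. A minimal standalone trace of the same input, using a plain 3.x WhitespaceTokenizer instead of the test framework's MockTokenizer (class name and setup are placeholders):

```java
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;

public class OffsetLimitTrace {

  public static void main(String[] args) throws Exception {
    TokenStream ts = new OffsetLimitTokenFilter(new WhitespaceTokenizer(
        new StringReader("short toolong evenmuchlongertext a ab toolong foo")), 10);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    // "short" spans offsets 0-5, so the count is 5 and still under the limit of 10;
    // "toolong" (offsets 6-13) is therefore emitted too, pushing the count to 12.
    // The next check sees 12 >= 10 and ends the stream: prints "short", then "toolong".
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
  }
}
```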
@@ -278,6 +278,9 @@ Bug Fixes
 
 * SOLR-2333: The "rename" core admin action does not persist the new name to solr.xml
   (Rasmus Hahn, Paul R. Brown via Mark Miller)
 
+* SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents,
+  regardless of hl.maxDocCharsToAnalyze. (Mark Miller)
+
 Other Changes
 ----------------------
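On the Solr side the cap is driven by a request parameter that DefaultSolrHighlighter reads via HighlightParams.MAX_CHARS (see the diff below). As a hedged illustration of setting it from SolrJ — the query, field name, limit value, and helper class are arbitrary placeholders, and only the HighlightParams constants shown in the diff are taken from this commit:

```java
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.common.params.HighlightParams;

public class HighlightRequestSketch {

  // Build a highlighting request that caps how many characters per field value
  // the phrase highlighter will analyze.
  public static SolrQuery buildQuery() {
    SolrQuery q = new SolrQuery("content:lucene");          // hypothetical query/field
    q.setHighlight(true);
    q.set(HighlightParams.FIELDS, "content");               // hl.fl
    q.set(HighlightParams.USE_PHRASE_HIGHLIGHTER, true);    // hl.usePhraseHighlighter
    q.set(HighlightParams.MAX_CHARS, 51200);                // limit read by DefaultSolrHighlighter
    return q;
  }
}
```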
@@ -435,12 +435,20 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
         // fall back to analyzer
         tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
       }
 
+      int maxCharsToAnalyze = params.getFieldInt(fieldName,
+          HighlightParams.MAX_CHARS,
+          Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
+
       Highlighter highlighter;
       if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
         // TODO: this is not always necessary - eventually we would like to avoid this wrap
         // when it is not needed.
-        tstream = new CachingTokenFilter(tstream);
+        if (maxCharsToAnalyze < 0) {
+          tstream = new CachingTokenFilter(tstream);
+        } else {
+          tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
+        }
 
         // get highlighter
         highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
@@ -453,9 +461,6 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
         highlighter = getHighlighter(query, fieldName, req);
       }
 
-      int maxCharsToAnalyze = params.getFieldInt(fieldName,
-          HighlightParams.MAX_CHARS,
-          Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
       if (maxCharsToAnalyze < 0) {
         highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
       } else {
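Restated compactly: a negative max-chars value now means "no limit", so the raw stream is cached whole, while any non-negative value clips the stream with OffsetLimitTokenFilter before it is cached for the phrase highlighter. A small helper sketch of that decision (not code from the commit; the method and class names are hypothetical):

```java
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;

final class PhraseHighlightWrapping {

  // A negative limit is treated as "analyze everything"; otherwise the stream is
  // clipped by OffsetLimitTokenFilter before being cached for the phrase highlighter.
  static CachingTokenFilter wrap(TokenStream tstream, int maxCharsToAnalyze) {
    if (maxCharsToAnalyze < 0) {
      return new CachingTokenFilter(tstream);
    }
    return new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
  }
}
```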