diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragListBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragListBuilder.java new file mode 100644 index 00000000000..86494a676f0 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragListBuilder.java @@ -0,0 +1,168 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; +import org.apache.lucene.util.Version; +import org.elasticsearch.common.lucene.Lucene; + +/** + * A non-abstract copy of the abstract {@link BaseFragListBuilder}. that works + * in the same way as {@link SimpleFragListBuilder} but isn't prone to the + * problem of negative offsets. This is fixed in Lucene 4.3 and this class + * should be removed once Elasticsearch upgraded to Lucene 4.3 + *

+ * LUCENE-4899: FastVectorHighlihgter failed with + * {@link StringIndexOutOfBoundsException} if a single highlight phrase or term was + * greater than the fragCharSize producing negative string offsets + *

+ */ +public final class XSimpleFragListBuilder implements FragListBuilder { + + public static final int MARGIN_DEFAULT = 6; + public static final int MIN_FRAG_CHAR_SIZE_FACTOR = 3; + + final int margin; + final int minFragCharSize; + static { + assert Version.LUCENE_42 == Lucene.VERSION: "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this should can be removed"; + } + + public XSimpleFragListBuilder(int margin) { + if (margin < 0) + throw new IllegalArgumentException("margin(" + margin + ") is too small. It must be 0 or higher."); + + this.margin = margin; + this.minFragCharSize = Math.max(1, margin * MIN_FRAG_CHAR_SIZE_FACTOR); + } + + public XSimpleFragListBuilder() { + this(MARGIN_DEFAULT); + } + + @Override + public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) { + return createFieldFragList(fieldPhraseList, new SimpleFieldFragList(fragCharSize), fragCharSize); + } + + protected FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize) { + if (fragCharSize < minFragCharSize) + throw new IllegalArgumentException("fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher."); + + List wpil = new ArrayList(); + IteratorQueue queue = new IteratorQueue(fieldPhraseList.getPhraseList().iterator()); + WeightedPhraseInfo phraseInfo = null; + int startOffset = 0; + while ((phraseInfo = queue.top()) != null) { + // if the phrase violates the border of previous fragment, discard + // it and try next phrase + if (phraseInfo.getStartOffset() < startOffset) { + queue.removeTop(); + continue; + } + + wpil.clear(); + final int currentPhraseStartOffset = phraseInfo.getStartOffset(); + int currentPhraseEndOffset = phraseInfo.getEndOffset(); + int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset); + int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize); + if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) { + wpil.add(phraseInfo); + } + while ((phraseInfo = queue.top()) != null) { // pull until we crossed the current spanEnd + if (phraseInfo.getEndOffset() <= spanEnd) { + currentPhraseEndOffset = phraseInfo.getEndOffset(); + if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) { + wpil.add(phraseInfo); + } + } else { + break; + } + } + if (wpil.isEmpty()) { + continue; + } + + final int matchLen = currentPhraseEndOffset - currentPhraseStartOffset; + // now recalculate the start and end position to "center" the result + final int newMargin = Math.max(0, (fragCharSize - matchLen) / 2); + // matchLen can be > fragCharSize prevent IAOOB here + spanStart = currentPhraseStartOffset - newMargin; + if (spanStart < startOffset) { + spanStart = startOffset; + } + // whatever is bigger here we grow this out + spanEnd = spanStart + Math.max(matchLen, fragCharSize); + startOffset = spanEnd; + fieldFragList.add(spanStart, spanEnd, wpil); + } + return fieldFragList; + } + + /** + * A predicate to decide if the given {@link WeightedPhraseInfo} should be + * accepted as a highlighted phrase or if it should be discarded. + *

+ * The default implementation discards phrases that are composed of more + * than one term and where the matchLength exceeds the fragment character + * size. + * + * @param info + * the phrase info to accept + * @param matchLength + * the match length of the current phrase + * @param fragCharSize + * the configured fragment character size + * @return true if this phrase info should be accepted as a + * highligh phrase + */ + protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) { + return info.getTermsOffsets().size() <= 1 || matchLength <= fragCharSize; + } + + private static final class IteratorQueue { + private final Iterator iter; + private T top; + + public IteratorQueue(Iterator iter) { + this.iter = iter; + T removeTop = removeTop(); + assert removeTop == null; + } + + public T top() { + return top; + } + + public T removeTop() { + T currentTop = top; + if (iter.hasNext()) { + top = iter.next(); + } else { + top = null; + } + return currentTop; + } + + } + +} diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java b/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java index 6bef7751641..28d3e64ad46 100644 --- a/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java +++ b/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java @@ -267,11 +267,7 @@ public class HighlightPhase extends AbstractComponent implements FetchSubPhase { fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner); } } else { - if (field.fragmentOffset() == -1) - fragListBuilder = new SimpleFragListBuilder(); - else - fragListBuilder = new SimpleFragListBuilder(field.fragmentOffset()); - + fragListBuilder = field.fragmentOffset() == -1 ? new XSimpleFragListBuilder() : new XSimpleFragListBuilder(field.fragmentOffset()); if (field.scoreOrdered()) { if (mapper.fieldType().stored()) { fragmentsBuilder = new XScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner); diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java index e1a8ddd0e56..65ffabec45a 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java @@ -19,6 +19,12 @@ package org.elasticsearch.test.integration.search.highlight; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.vectorhighlight.FieldQuery; import org.elasticsearch.ElasticSearchException; import org.elasticsearch.action.search.SearchPhaseExecutionException; import org.elasticsearch.action.search.SearchResponse; @@ -31,6 +37,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.query.FilterBuilders; import org.elasticsearch.index.query.MatchQueryBuilder; +import org.elasticsearch.index.query.MatchQueryBuilder.Type; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.indices.IndexMissingException; import org.elasticsearch.rest.RestStatus; @@ -81,6 +88,69 @@ public class HighlighterSearchTests extends AbstractNodesTests { return client("server1"); } + + @Test + public void testEnsureNoNegativeOffsets() throws Exception { + try { + client.admin().indices().prepareDelete("test").execute().actionGet(); + } catch (Exception e) { + // ignore + } + + client.admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2)) + .addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties") + // we don't store title, now lets see if it works... + .startObject("no_long_term").field("type", "string").field("store", "no").field("term_vector", "with_positions_offsets").endObject() + .startObject("long_term").field("type", "string").field("store", "no").field("term_vector", "with_positions_offsets").endObject() + .endObject().endObject().endObject()) + .execute().actionGet(); + + + client.prepareIndex("test", "type1", "1") + .setSource(XContentFactory.jsonBuilder().startObject() + .startArray("no_long_term") + .value("This is a test where foo is highlighed and should be highlighted") + .endArray() + .startArray("long_term") + .value("This is a test thisisaverylongwordandmakessurethisfails where foo is highlighed and should be highlighted") + .endArray() + .endObject()) + .setRefresh(true).execute().actionGet(); + + + SearchResponse search = client.prepareSearch() + .setQuery(matchQuery("long_term", "thisisaverylongwordandmakessurethisfails foo highlighed")) + .addHighlightedField("long_term", 18, 1) + .execute().actionGet(); + + assertThat(search.getHits().totalHits(), equalTo(1l)); + assertThat(search.getHits().hits().length, equalTo(1)); + + assertThat(search.getHits().hits()[0].highlightFields().get("long_term").fragments().length, equalTo(1)); + assertThat(search.getHits().hits()[0].highlightFields().get("long_term").fragments()[0].string(), equalTo("thisisaverylongwordandmakessurethisfails")); + + + search = client.prepareSearch() + .setQuery(matchQuery("no_long_term", "test foo highlighed").type(Type.PHRASE).slop(3)) + .addHighlightedField("no_long_term", 18, 1).setHighlighterPostTags("").setHighlighterPreTags("") + .execute().actionGet(); + assertThat(search.getHits().totalHits(), equalTo(1l)); + assertThat(search.getHits().hits().length, equalTo(1)); + assertThat(search.getHits().hits()[0].highlightFields().size(), equalTo(0)); + + + search = client.prepareSearch() + .setQuery(matchQuery("no_long_term", "test foo highlighed").type(Type.PHRASE).slop(3)) + .addHighlightedField("no_long_term", 30, 1).setHighlighterPostTags("").setHighlighterPreTags("") + .execute().actionGet(); + + assertThat(search.getHits().totalHits(), equalTo(1l)); + assertThat(search.getHits().hits().length, equalTo(1)); + + assertThat(search.getHits().hits()[0].highlightFields().get("no_long_term").fragments().length, equalTo(1)); + assertThat(search.getHits().hits()[0].highlightFields().get("no_long_term").fragments()[0].string(), equalTo("a test where foo is highlighed and")); + } + @Test public void testSourceLookupHighlightingUsingPlainHighlighter() throws Exception { try {