Don't fail hard if broken analysis is used.

Today an analysis chain with broken token filters or tokenizers like
WordDelimiterFilter might produce somewhat broken term vectors that cause
`StringIndexOutOfBoundsException`s when the FastVectorHighlighter is used:
the positions/offsets contract is violated, and the offsets of highlight
tokens decrease even though their positions increase.

Yet, if we detect such a situation we can re-sort the tokens, which might produce
somewhat odd highlights but doesn't fail hard with a StringIndexOutOfBoundsException.
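
For illustration only (this sketch is not part of the commit; OffsetResortSketch and
Token are hypothetical stand-ins for the highlighter's WeightedFragInfo.SubInfo entries),
here is the re-sort idea in miniature: positions increase while one start offset jumps
backwards, and sorting by start offset restores a usable order:

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

public class OffsetResortSketch {
    // Hypothetical stand-in for a highlight token: only position and
    // start offset matter for this illustration.
    static final class Token {
        final int position;
        final int startOffset;
        Token(int position, int startOffset) {
            this.position = position;
            this.startOffset = startOffset;
        }
    }

    public static void main(String[] args) {
        List<Token> tokens = new ArrayList<Token>();
        tokens.add(new Token(1, 4));
        tokens.add(new Token(2, 0));  // offset decreases although the position increases
        tokens.add(new Token(3, 10));
        // The fix in a nutshell: order highlight tokens by start offset instead
        // of position, trading potentially odd highlights for not throwing.
        Collections.sort(tokens, new Comparator<Token>() {
            @Override
            public int compare(Token o1, Token o2) {
                return Integer.compare(o1.startOffset, o2.startOffset);
            }
        });
        for (Token t : tokens) {
            System.out.println("pos=" + t.position + " startOffset=" + t.startOffset);
        }
    }
}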

Closes #3006
Simon Willnauer committed 2013-05-07 15:56:41 +02:00
commit e1b66b34ea (parent 14ae2fb765)
5 changed files with 225 additions and 34 deletions

File: org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java (new file)

@@ -0,0 +1,102 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.highlight.vectorhighlight;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.NGramTokenFilterFactory;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.mapper.FieldMapper;
/**
* Simple helper class for {@link FastVectorHighlighter} {@link FragmentsBuilder} implementations.
*/
public final class FragmentBuilderHelper {
private FragmentBuilderHelper() {
// no instance
}
/**
* Fixes problems with broken analysis chains where positions and offsets are messed up, which can lead to a
* {@link StringIndexOutOfBoundsException} in the {@link FastVectorHighlighter}
*/
public static WeightedFragInfo fixWeightedFragInfo(FieldMapper<?> mapper, Field[] values, WeightedFragInfo fragInfo) {
assert fragInfo != null : "FragInfo must not be null";
assert mapper.names().indexName().equals(values[0].name()) : "Expected FieldMapper for field " + values[0].name();
if (!fragInfo.getSubInfos().isEmpty() && (containsBrokenAnalysis(mapper.indexAnalyzer()))) {
/* This is a special case where a broken analysis chain (like WDF) is used for term-vector creation at index-time,
* which can potentially mess up the offsets. To prevent a StringIndexOutOfBoundsException we need to re-sort
* the fragments based on their offsets rather than relying solely on the positions, as the
* FastVectorHighlighter does. Yet, this is really a lucene problem and should be fixed in lucene rather
* than in this hack... aka. "we are working on it!" */
final List<SubInfo> subInfos = fragInfo.getSubInfos();
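// note: Collections.sort is guaranteed to be stable, so sub-infos with equal start offsets keep their original (position-based) order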
Collections.sort(subInfos, new Comparator<SubInfo>() {
@Override
public int compare(SubInfo o1, SubInfo o2) {
int startOffset = o1.getTermsOffsets().get(0).getStartOffset();
int startOffset2 = o2.getTermsOffsets().get(0).getStartOffset();
return Integer.compare(startOffset, startOffset2);
}
});
return new WeightedFragInfo(Math.min(fragInfo.getSubInfos().get(0).getTermsOffsets().get(0).getStartOffset(),
fragInfo.getStartOffset()), fragInfo.getEndOffset(), subInfos, fragInfo.getTotalBoost());
} else {
return fragInfo;
}
}
private static boolean containsBrokenAnalysis(Analyzer analyzer) {
// TODO maybe we need a getter on NamedAnalyzer that tells if this uses broken analysis
if (analyzer instanceof NamedAnalyzer) {
analyzer = ((NamedAnalyzer) analyzer).analyzer();
}
if (analyzer instanceof CustomAnalyzer) {
final CustomAnalyzer a = (CustomAnalyzer) analyzer;
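// EdgeNGram tokenizers/filters and WordDelimiterFilter are known to emit tokens with out-of-order offsets (see #3006)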
if (a.tokenizerFactory() instanceof EdgeNGramTokenizerFactory) {
return true;
}
TokenFilterFactory[] tokenFilters = a.tokenFilters();
for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
if (tokenFilterFactory instanceof WordDelimiterTokenFilterFactory
|| tokenFilterFactory instanceof EdgeNGramTokenFilterFactory) {
return true;
}
}
}
return false;
}
}

File: org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java

@@ -19,29 +19,38 @@
package org.elasticsearch.search.highlight.vectorhighlight;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ngram.NGramTokenizerFactory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.lookup.SearchLookup;
-import java.io.IOException;
-import java.util.List;
/**
*
*/
public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder {
-private final FieldMapper mapper;
+private final FieldMapper<?> mapper;
private final SearchContext searchContext;
-public SourceScoreOrderFragmentsBuilder(FieldMapper mapper, SearchContext searchContext,
+public SourceScoreOrderFragmentsBuilder(FieldMapper<?> mapper, SearchContext searchContext,
String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
super(preTags, postTags, boundaryScanner);
this.mapper = mapper;
@@ -62,4 +71,9 @@ public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder
}
return fields;
}
protected String makeFragment(StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
String[] preTags, String[] postTags, Encoder encoder) {
return super.makeFragment(buffer, index, values, FragmentBuilderHelper.fixWeightedFragInfo(mapper, values, fragInfo), preTags, postTags, encoder);
}
}

File: org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java

@@ -23,13 +23,18 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.lookup.SearchLookup;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
@@ -68,4 +73,9 @@ public class SourceSimpleFragmentsBuilder extends SimpleFragmentsBuilder
return fields;
}
protected String makeFragment(StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
String[] preTags, String[] postTags, Encoder encoder) {
return super.makeFragment(buffer, index, values, FragmentBuilderHelper.fixWeightedFragInfo(mapper, values, fragInfo), preTags, postTags, encoder);
}
}

File: org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java

@@ -21,6 +21,8 @@ package org.elasticsearch.test.hamcrest;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.*;
import java.util.Arrays;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.search.SearchHit;
import org.hamcrest.Matcher;
@@ -61,8 +63,8 @@ public class ElasticsearchAssertions {
}
public static void assertHighlight(SearchResponse resp, int hit, String field, int fragment, Matcher<String> matcher) {
-assertThat(resp.getShardFailures().length, equalTo(0));
-assertThat(resp.getHits().hits().length, greaterThan(hit));
+assertThat("Unexpected ShardFailures: " + Arrays.toString(resp.getShardFailures()), resp.getShardFailures().length, equalTo(0));
+assertThat("not enough hits", resp.getHits().hits().length, greaterThan(hit));
assertThat(resp.getHits().hits()[hit].getHighlightFields().get(field), notNullValue());
assertThat(resp.getHits().hits()[hit].getHighlightFields().get(field).fragments().length, greaterThan(fragment));
assertThat(resp.getHits().hits()[hit].highlightFields().get(field).fragments()[fragment].string(), matcher);

File: org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java

@@ -19,12 +19,31 @@
package org.elasticsearch.test.integration.search.highlight;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import static org.elasticsearch.action.search.SearchType.QUERY_THEN_FETCH;
import static org.elasticsearch.client.Requests.searchRequest;
import static org.elasticsearch.common.unit.TimeValue.timeValueMinutes;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.boostingQuery;
import static org.elasticsearch.index.query.QueryBuilders.commonTerms;
import static org.elasticsearch.index.query.QueryBuilders.constantScoreQuery;
import static org.elasticsearch.index.query.QueryBuilders.fieldQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchPhrasePrefixQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.index.query.QueryBuilders.prefixQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight;
import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.testng.Assert.fail;
import java.io.IOException;
import java.util.Arrays;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.action.search.SearchPhaseExecutionException;
import org.elasticsearch.action.search.SearchResponse;
@@ -37,6 +56,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder.Operator;
import org.elasticsearch.index.query.MatchQueryBuilder.Type;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.indices.IndexMissingException;
@@ -45,31 +65,10 @@ import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.highlight.HighlightBuilder;
import org.elasticsearch.test.integration.AbstractNodesTests;
import org.hamcrest.Matcher;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.IOException;
import java.util.Arrays;
import static org.elasticsearch.action.search.SearchType.QUERY_THEN_FETCH;
import static org.elasticsearch.client.Requests.searchRequest;
import static org.elasticsearch.common.unit.TimeValue.timeValueMinutes;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.*;
import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight;
import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.instanceOf;
import static org.testng.Assert.fail;
/**
*
*/
@@ -94,6 +93,70 @@ public class HighlighterSearchTests extends AbstractNodesTests {
return client("server1");
}
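// Reproduces #3006: an nGram tokenizer combined with a word_delimiter filter can emit
// tokens whose offsets go backwards, which used to make the FastVectorHighlighter fail
// with a StringIndexOutOfBoundsException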
@Test
public void testNgramHighlightingWithBrokenPositions() throws ElasticSearchException, IOException {
try {
client.admin().indices().prepareDelete("test").execute().actionGet();
} catch (Exception e) {
// ignore
}
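// multi_field mapping: name.autocomplete uses the broken nGram + word_delimiter chain and
// stores term vectors with positions and offsets, as required by the FastVectorHighlighter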
client.admin().indices().prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("name")
.startObject("fields")
.startObject("autocomplete")
.field("type", "string")
.field("index_analyzer", "autocomplete")
.field("search_analyzer", "search_autocomplete")
.field("term_vector", "with_positions_offsets")
.endObject()
.startObject("name")
.field("type", "string")
.endObject()
.endObject()
.field("type", "multi_field")
.endObject()
.endObject()
.endObject())
.setSettings(ImmutableSettings.settingsBuilder()
.put("index.number_of_shards", 1)
.put("analysis.tokenizer.autocomplete.max_gram", 20)
.put("analysis.tokenizer.autocomplete.min_gram", 1)
.put("analysis.tokenizer.autocomplete.type", "nGram")
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.putArray("analysis.filter.wordDelimiter.type_table",
"& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
"? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", "# => ALPHANUM", "% => ALPHANUM",
"+ => ALPHANUM", ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", "/ => ALPHANUM",
"^ => ALPHANUM", "$ => ALPHANUM", "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
"] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", "{ => ALPHANUM")
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
.put("analysis.filter.wordDelimiter.generate_number_parts", false)
.put("analysis.filter.wordDelimiter.catenate_words", true)
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
.put("analysis.filter.wordDelimiter.catenate_all", false)
.put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
.putArray("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter")
.put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
.putArray("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter"))
.execute().actionGet();
client.prepareIndex("test", "test", "1")
.setSource(XContentFactory.jsonBuilder()
.startObject()
.field( "name", "ARCOTEL Hotels Deutschland")
.endObject())
.setRefresh(true).execute().actionGet();
SearchResponse search = client.prepareSearch("test").setTypes("test")
.setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
.addHighlightedField("name.autocomplete").execute().actionGet();
assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
}
@Test
public void testNgramHighlighting() throws ElasticSearchException, IOException {