From e1b66b34ea92a29349de07bc209c8f85d680effc Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 7 May 2013 15:56:41 +0200 Subject: [PATCH] Don't fail hard if broken analysis is used. Today an analysis chain with broken tokenfilters or tokenizers like WordDelimiterFilter might produce somewhat broken term vectors that cause `StringIndexOutOfBoundsExceptions` if FastVectorHighlighter is used since the positions / offsets contract is violated and offsets of highlight tokens are not increasing but decreasing even if their positions are increasing. Yet, if we detect such a situation we can resort the tokens which might cause somewhat odd highlights but doesn't fail hard with a StringIndexOOBException. Closes #3006 --- .../FragmentBuilderHelper.java | 102 +++++++++++++++ .../SourceScoreOrderFragmentsBuilder.java | 24 +++- .../SourceSimpleFragmentsBuilder.java | 10 ++ .../hamcrest/ElasticsearchAssertions.java | 6 +- .../highlight/HighlighterSearchTests.java | 117 ++++++++++++++---- 5 files changed, 225 insertions(+), 34 deletions(-) create mode 100644 src/main/java/org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java diff --git a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java new file mode 100644 index 00000000000..91c015f6b85 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java @@ -0,0 +1,102 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.elasticsearch.search.highlight.vectorhighlight; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; +import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; +import org.elasticsearch.index.analysis.CustomAnalyzer; +import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory; +import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; +import org.elasticsearch.index.analysis.NGramTokenFilterFactory; +import org.elasticsearch.index.analysis.NGramTokenizerFactory; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory; +import org.elasticsearch.index.mapper.FieldMapper; + +/** + * Simple helper class for {@link FastVectorHighlighter} {@link FragmentsBuilder} implemenations. + */ +public final class FragmentBuilderHelper { + + private FragmentBuilderHelper() { + // no instance + } + + /** + * Fixes problems with broken analysis chains if positions and offsets are messed up that can lead to + * {@link StringIndexOutOfBoundsException} in the {@link FastVectorHighlighter} + */ + public static WeightedFragInfo fixWeightedFragInfo(FieldMapper mapper, Field[] values, WeightedFragInfo fragInfo) { + assert fragInfo != null : "FragInfo must not be null"; + assert mapper.names().indexName().equals(values[0].name()) : "Expected FieldMapper for field " + values[0].name(); + if (!fragInfo.getSubInfos().isEmpty() && (containsBrokenAnalysis(mapper.indexAnalyzer()))) { + /* This is a special case where broken analysis like WDF is used for term-vector creation at index-time + * which can potentially mess up the offsets. To prevent a SAIIOBException we need to resort + * the fragments based on their offsets rather than using soley the positions as it is done in + * the FastVectorHighlighter. Yet, this is really a lucene problem and should be fixed in lucene rather + * than in this hack... aka. "we are are working on in!" */ + final List subInfos = fragInfo.getSubInfos(); + Collections.sort(subInfos, new Comparator() { + @Override + public int compare(SubInfo o1, SubInfo o2) { + int startOffset = o1.getTermsOffsets().get(0).getStartOffset(); + int startOffset2 = o2.getTermsOffsets().get(0).getStartOffset(); + return Integer.compare(startOffset, startOffset2); + } + }); + return new WeightedFragInfo(Math.min(fragInfo.getSubInfos().get(0).getTermsOffsets().get(0).getStartOffset(), + fragInfo.getStartOffset()), fragInfo.getEndOffset(), subInfos, fragInfo.getTotalBoost()); + } else { + return fragInfo; + } + } + + private static boolean containsBrokenAnalysis(Analyzer analyzer) { + // TODO maybe we need a getter on Namedanalyzer that tells if this uses broken Analysis + if (analyzer instanceof NamedAnalyzer) { + analyzer = ((NamedAnalyzer) analyzer).analyzer(); + } + if (analyzer instanceof CustomAnalyzer) { + final CustomAnalyzer a = (CustomAnalyzer) analyzer; + if (a.tokenizerFactory() instanceof EdgeNGramTokenizerFactory) { + return true; + } + TokenFilterFactory[] tokenFilters = a.tokenFilters(); + for (TokenFilterFactory tokenFilterFactory : tokenFilters) { + if (tokenFilterFactory instanceof WordDelimiterTokenFilterFactory + || tokenFilterFactory instanceof EdgeNGramTokenFilterFactory) { + return true; + } + } + } + return false; + } +} diff --git a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java index 6e25ec3ff1e..72d3c24763d 100644 --- a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java +++ b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java @@ -19,29 +19,38 @@ package org.elasticsearch.search.highlight.vectorhighlight; +import java.io.IOException; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.ngram.NGramTokenizerFactory; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.vectorhighlight.BoundaryScanner; +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder; +import org.elasticsearch.index.analysis.CustomAnalyzer; +import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.search.lookup.SearchLookup; -import java.io.IOException; -import java.util.List; - /** * */ public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder { - private final FieldMapper mapper; + private final FieldMapper mapper; private final SearchContext searchContext; - public SourceScoreOrderFragmentsBuilder(FieldMapper mapper, SearchContext searchContext, + public SourceScoreOrderFragmentsBuilder(FieldMapper mapper, SearchContext searchContext, String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) { super(preTags, postTags, boundaryScanner); this.mapper = mapper; @@ -62,4 +71,9 @@ public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder } return fields; } + + protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo, + String[] preTags, String[] postTags, Encoder encoder ){ + return super.makeFragment(buffer, index, values, FragmentBuilderHelper.fixWeightedFragInfo(mapper, values, fragInfo), preTags, postTags, encoder); + } } diff --git a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java index f1fecb57047..c2f1237b7a4 100644 --- a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java +++ b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java @@ -23,13 +23,18 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.vectorhighlight.BoundaryScanner; import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder; +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.search.lookup.SearchLookup; import java.io.IOException; +import java.util.Collections; +import java.util.Comparator; import java.util.List; /** @@ -67,5 +72,10 @@ public class SourceSimpleFragmentsBuilder extends SimpleFragmentsBuilder { } return fields; } + + protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo, + String[] preTags, String[] postTags, Encoder encoder ){ + return super.makeFragment(buffer, index, values, FragmentBuilderHelper.fixWeightedFragInfo(mapper, values, fragInfo), preTags, postTags, encoder); + } } diff --git a/src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java b/src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java index 1266b4a7759..b3fb9ff8ff5 100644 --- a/src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java +++ b/src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java @@ -21,6 +21,8 @@ package org.elasticsearch.test.hamcrest; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.*; +import java.util.Arrays; + import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.search.SearchHit; import org.hamcrest.Matcher; @@ -61,8 +63,8 @@ public class ElasticsearchAssertions { } public static void assertHighlight(SearchResponse resp, int hit, String field, int fragment, Matcher matcher) { - assertThat(resp.getShardFailures().length, equalTo(0)); - assertThat(resp.getHits().hits().length, greaterThan(hit)); + assertThat("Unexpectd ShardFailures: " + Arrays.toString(resp.getShardFailures()), resp.getShardFailures().length, equalTo(0)); + assertThat("not enough hits", resp.getHits().hits().length, greaterThan(hit)); assertThat(resp.getHits().hits()[hit].getHighlightFields().get(field), notNullValue()); assertThat(resp.getHits().hits()[hit].getHighlightFields().get(field).fragments().length, greaterThan(fragment)); assertThat(resp.getHits().hits()[hit].highlightFields().get(field).fragments()[fragment].string(), matcher); diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java index e3c6c6988e7..bdf985baf5d 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java @@ -19,12 +19,31 @@ package org.elasticsearch.test.integration.search.highlight; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.vectorhighlight.FieldQuery; +import static org.elasticsearch.action.search.SearchType.QUERY_THEN_FETCH; +import static org.elasticsearch.client.Requests.searchRequest; +import static org.elasticsearch.common.unit.TimeValue.timeValueMinutes; +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.index.query.QueryBuilders.boolQuery; +import static org.elasticsearch.index.query.QueryBuilders.boostingQuery; +import static org.elasticsearch.index.query.QueryBuilders.commonTerms; +import static org.elasticsearch.index.query.QueryBuilders.constantScoreQuery; +import static org.elasticsearch.index.query.QueryBuilders.fieldQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchPhrasePrefixQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchQuery; +import static org.elasticsearch.index.query.QueryBuilders.prefixQuery; +import static org.elasticsearch.index.query.QueryBuilders.termQuery; +import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight; +import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.instanceOf; +import static org.testng.Assert.fail; + +import java.io.IOException; +import java.util.Arrays; + import org.elasticsearch.ElasticSearchException; import org.elasticsearch.action.search.SearchPhaseExecutionException; import org.elasticsearch.action.search.SearchResponse; @@ -37,6 +56,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.query.FilterBuilders; import org.elasticsearch.index.query.MatchQueryBuilder; +import org.elasticsearch.index.query.MatchQueryBuilder.Operator; import org.elasticsearch.index.query.MatchQueryBuilder.Type; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.indices.IndexMissingException; @@ -45,31 +65,10 @@ import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.highlight.HighlightBuilder; import org.elasticsearch.test.integration.AbstractNodesTests; -import org.hamcrest.Matcher; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.io.IOException; -import java.util.Arrays; - -import static org.elasticsearch.action.search.SearchType.QUERY_THEN_FETCH; -import static org.elasticsearch.client.Requests.searchRequest; -import static org.elasticsearch.common.unit.TimeValue.timeValueMinutes; -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; -import static org.elasticsearch.index.query.QueryBuilders.*; -import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight; -import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.greaterThan; -import static org.hamcrest.Matchers.notNullValue; - - -import static org.hamcrest.Matchers.instanceOf; -import static org.testng.Assert.fail; - /** * */ @@ -94,6 +93,70 @@ public class HighlighterSearchTests extends AbstractNodesTests { return client("server1"); } + @Test + public void testNgramHighlightingWithBrokenPositions() throws ElasticSearchException, IOException { + try { + client.admin().indices().prepareDelete("test").execute().actionGet(); + } catch (Exception e) { + // ignore + } + + client.admin().indices().prepareCreate("test") + .addMapping("test", jsonBuilder() + .startObject() + .startObject("test") + .startObject("properties") + .startObject("name") + .startObject("fields") + .startObject("autocomplete") + .field("type", "string") + .field("index_analyzer", "autocomplete") + .field("search_analyzer", "search_autocomplete") + .field("term_vector", "with_positions_offsets") + .endObject() + .startObject("name") + .field("type", "string") + .endObject() + .endObject() + .field("type", "multi_field") + .endObject() + .endObject() + .endObject()) + .setSettings(ImmutableSettings.settingsBuilder() + .put("index.number_of_shards", 1) + .put("analysis.tokenizer.autocomplete.max_gram", 20) + .put("analysis.tokenizer.autocomplete.min_gram", 1) + .put("analysis.tokenizer.autocomplete.type", "nGram") + .put("analysis.filter.wordDelimiter.type", "word_delimiter") + .putArray("analysis.filter.wordDelimiter.type_table", + "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM", + "? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", "# => ALPHANUM", "% => ALPHANUM", + "+ => ALPHANUM", ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", "/ => ALPHANUM", + "^ => ALPHANUM", "$ => ALPHANUM", "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM", + "] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", "{ => ALPHANUM") + + .put("analysis.filter.wordDelimiter.type.split_on_numerics", false) + .put("analysis.filter.wordDelimiter.generate_word_parts", true) + .put("analysis.filter.wordDelimiter.generate_number_parts", false) + .put("analysis.filter.wordDelimiter.catenate_words", true) + .put("analysis.filter.wordDelimiter.catenate_numbers", true) + .put("analysis.filter.wordDelimiter.catenate_all", false) + + .put("analysis.analyzer.autocomplete.tokenizer", "autocomplete") + .putArray("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter") + .put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace") + .putArray("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter")) + .execute().actionGet(); + client.prepareIndex("test", "test", "1") + .setSource(XContentFactory.jsonBuilder() + .startObject() + .field( "name", "ARCOTEL Hotels Deutschland") + .endObject()) + .setRefresh(true).execute().actionGet(); + SearchResponse search = client.prepareSearch("test").setTypes("test").setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR)).addHighlightedField("name.autocomplete").execute().actionGet(); + assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCOTEL Hotels Deutschland")); + } + @Test public void testNgramHighlighting() throws ElasticSearchException, IOException {