Bug fix for AnnotatedTextHighlighter - port of 39525 (#39749)

Bug fix for AnnotatedTextHighlighter - port of 39525

Relates to #39395
This commit is contained in:
markharwood 2019-03-06 19:02:04 +00:00 committed by GitHub
parent d094107592
commit 1873de5240
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 153 additions and 65 deletions

View File

@ -54,6 +54,7 @@ import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.mapper.TextFieldMapper;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
import java.io.IOException;
import java.io.Reader;
@ -315,45 +316,12 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
// When asked to tokenize plain-text versions by the highlighter it tokenizes the
// original markup form in order to inject annotations.
public static final class AnnotatedHighlighterAnalyzer extends AnalyzerWrapper {
private Analyzer delegate;
private AnnotatedText[] annotations;
public AnnotatedHighlighterAnalyzer(Analyzer delegate){
private final Analyzer delegate;
private final HitContext hitContext;
public AnnotatedHighlighterAnalyzer(Analyzer delegate, HitContext hitContext){
super(delegate.getReuseStrategy());
this.delegate = delegate;
}
public void init(String[] markedUpFieldValues) {
this.annotations = new AnnotatedText[markedUpFieldValues.length];
for (int i = 0; i < markedUpFieldValues.length; i++) {
annotations[i] = AnnotatedText.parse(markedUpFieldValues[i]);
}
}
public String [] getPlainTextValuesForHighlighter(){
String [] result = new String[annotations.length];
for (int i = 0; i < annotations.length; i++) {
result[i] = annotations[i].textMinusMarkup;
}
return result;
}
public AnnotationToken[] getIntersectingAnnotations(int start, int end) {
List<AnnotationToken> intersectingAnnotations = new ArrayList<>();
int fieldValueOffset =0;
for (AnnotatedText fieldValueAnnotations : this.annotations) {
//This is called from a highlighter where all of the field values are concatenated
// so each annotation offset will need to be adjusted so that it takes into account
// the previous values AND the MULTIVAL delimiter
for (AnnotationToken token : fieldValueAnnotations.annotations) {
if(token.intersects(start - fieldValueOffset , end - fieldValueOffset)) {
intersectingAnnotations.add(new AnnotationToken(token.offset + fieldValueOffset,
token.endOffset + fieldValueOffset, token.value));
}
}
//add 1 for the fieldvalue separator character
fieldValueOffset +=fieldValueAnnotations.textMinusMarkup.length() +1;
}
return intersectingAnnotations.toArray(new AnnotationToken[intersectingAnnotations.size()]);
this.hitContext = hitContext;
}
@Override
@ -364,10 +332,11 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
AnnotationsInjector injector = new AnnotationsInjector(components.getTokenStream());
AnnotatedText[] annotations = (AnnotatedText[]) hitContext.cache().get(AnnotatedText.class.getName());
AtomicInteger readerNum = new AtomicInteger(0);
return new TokenStreamComponents(r -> {
String plainText = readToString(r);
AnnotatedText at = this.annotations[readerNum.getAndIncrement()];
AnnotatedText at = annotations[readerNum.getAndIncrement()];
assert at.textMinusMarkup.equals(plainText);
injector.setAnnotations(at);
components.getSource().accept(new StringReader(at.textMinusMarkup));

View File

@ -23,7 +23,7 @@ import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.apache.lucene.search.uhighlight.Snippet;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken;
import java.io.UnsupportedEncodingException;
@ -42,11 +42,11 @@ public class AnnotatedPassageFormatter extends PassageFormatter {
public static final String SEARCH_HIT_TYPE = "_hit_term";
private final Encoder encoder;
private AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer;
AnnotatedText[] annotations;
public AnnotatedPassageFormatter(AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer, Encoder encoder) {
this.annotatedHighlighterAnalyzer = annotatedHighlighterAnalyzer;
public AnnotatedPassageFormatter(AnnotatedText[] annotations, Encoder encoder) {
this.encoder = encoder;
this.annotations = annotations;
}
static class MarkupPassage {
@ -158,7 +158,7 @@ public class AnnotatedPassageFormatter extends PassageFormatter {
int pos;
int j = 0;
for (Passage passage : passages) {
AnnotationToken [] annotations = annotatedHighlighterAnalyzer.getIntersectingAnnotations(passage.getStartOffset(),
AnnotationToken [] annotations = getIntersectingAnnotations(passage.getStartOffset(),
passage.getEndOffset());
MarkupPassage mergedMarkup = mergeAnnotations(annotations, passage);
@ -194,6 +194,27 @@ public class AnnotatedPassageFormatter extends PassageFormatter {
}
return snippets;
}
public AnnotationToken[] getIntersectingAnnotations(int start, int end) {
List<AnnotationToken> intersectingAnnotations = new ArrayList<>();
int fieldValueOffset =0;
for (AnnotatedText fieldValueAnnotations : this.annotations) {
//This is called from a highlighter where all of the field values are concatenated
// so each annotation offset will need to be adjusted so that it takes into account
// the previous values AND the MULTIVAL delimiter
for (int i = 0; i < fieldValueAnnotations.numAnnotations(); i++) {
AnnotationToken token = fieldValueAnnotations.getAnnotation(i);
if (token.intersects(start - fieldValueOffset, end - fieldValueOffset)) {
intersectingAnnotations
.add(new AnnotationToken(token.offset + fieldValueOffset, token.endOffset +
fieldValueOffset, token.value));
}
}
//add 1 for the fieldvalue separator character
fieldValueOffset +=fieldValueAnnotations.textMinusMarkup.length() +1;
}
return intersectingAnnotations.toArray(new AnnotationToken[intersectingAnnotations.size()]);
}
private void append(StringBuilder dest, String content, int start, int end) {
dest.append(encoder.encodeText(content.substring(start, end)));

View File

@ -25,24 +25,22 @@ import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.Field;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
public class AnnotatedTextHighlighter extends UnifiedHighlighter {
public static final String NAME = "annotated";
AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer = null;
@Override
protected Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
annotatedHighlighterAnalyzer = new AnnotatedHighlighterAnalyzer(super.getAnalyzer(docMapper, type));
return annotatedHighlighterAnalyzer;
protected Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type, HitContext hitContext) {
return new AnnotatedHighlighterAnalyzer(super.getAnalyzer(docMapper, type, hitContext), hitContext);
}
// Convert the marked-up values held on-disk to plain-text versions for highlighting
@ -51,14 +49,26 @@ public class AnnotatedTextHighlighter extends UnifiedHighlighter {
throws IOException {
List<Object> fieldValues = super.loadFieldValues(fieldType, field, context, hitContext);
String[] fieldValuesAsString = fieldValues.toArray(new String[fieldValues.size()]);
annotatedHighlighterAnalyzer.init(fieldValuesAsString);
return Arrays.asList((Object[]) annotatedHighlighterAnalyzer.getPlainTextValuesForHighlighter());
AnnotatedText[] annotations = new AnnotatedText[fieldValuesAsString.length];
for (int i = 0; i < fieldValuesAsString.length; i++) {
annotations[i] = AnnotatedText.parse(fieldValuesAsString[i]);
}
// Store the annotations in the hitContext
hitContext.cache().put(AnnotatedText.class.getName(), annotations);
ArrayList<Object> result = new ArrayList<>(annotations.length);
for (int i = 0; i < annotations.length; i++) {
result.add(annotations[i].textMinusMarkup);
}
return result;
}
@Override
protected PassageFormatter getPassageFormatter(SearchContextHighlight.Field field, Encoder encoder) {
return new AnnotatedPassageFormatter(annotatedHighlighterAnalyzer, encoder);
protected PassageFormatter getPassageFormatter(HitContext hitContext,SearchContextHighlight.Field field, Encoder encoder) {
// Retrieve the annotations from the hitContext
AnnotatedText[] annotations = (AnnotatedText[]) hitContext.cache().get(AnnotatedText.class.getName());
return new AnnotatedPassageFormatter(annotations, encoder);
}
}

View File

@ -40,18 +40,20 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.store.Directory;
import org.elasticsearch.common.Strings;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper;
import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
import org.elasticsearch.search.fetch.subphase.highlight.AnnotatedPassageFormatter;
import org.elasticsearch.test.ESTestCase;
import java.net.URLEncoder;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Locale;
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
@ -63,13 +65,24 @@ public class AnnotatedTextHighlighterTests extends ESTestCase {
Query query, Locale locale, BreakIterator breakIterator,
int noMatchSize, String[] expectedPassages) throws Exception {
// Annotated fields wrap the usual analyzer with one that injects extra tokens
Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
hiliteAnalyzer.init(markedUpInputs);
PassageFormatter passageFormatter = new AnnotatedPassageFormatter(hiliteAnalyzer,new DefaultEncoder());
String []plainTextForHighlighter = hiliteAnalyzer.getPlainTextValuesForHighlighter();
HitContext mockHitContext = new HitContext();
AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer, mockHitContext);
AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
for (int i = 0; i < markedUpInputs.length; i++) {
annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
}
mockHitContext.cache().put(AnnotatedText.class.getName(), annotations);
AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(annotations,new DefaultEncoder());
ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
for (int i = 0; i < annotations.length; i++) {
plainTextForHighlighter.add(annotations[i].textMinusMarkup);
}
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
@ -94,7 +107,7 @@ public class AnnotatedTextHighlighterTests extends ESTestCase {
iw.close();
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
assertThat(topDocs.totalHits.value, equalTo(1L));
String rawValue = Strings.arrayToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, hiliteAnalyzer, null,
passageFormatter, locale,

View File

@ -42,3 +42,77 @@
body: { "query" : {"term" : { "text" : "quick" } }, "highlight" : { "type" : "annotated", "require_field_match": false, "fields" : { "text" : {} } } }
- match: {hits.hits.0.highlight.text.0: "The [quick](_hit_term=quick) brown fox is brown."}
---
"issue 39395 thread safety issue -requires multiple calls to reveal":
- skip:
version: " - 6.4.99"
reason: Annotated text type introduced in 6.5.0
- do:
indices.create:
index: annotated
body:
settings:
number_of_shards: "5"
number_of_replicas: "0"
mappings:
properties:
my_field:
type: annotated_text
- do:
index:
index: annotated
id: 1
body:
"my_field" : "[A](~MARK0&~MARK0) [B](~MARK1)"
- do:
index:
index: annotated
id: 2
body:
"my_field" : "[A](~MARK0) [C](~MARK2)"
refresh: true
- do:
search:
request_cache: false
body: { "query" : {"match_phrase" : { "my_field" : {"query": "~MARK0", "analyzer": "whitespace"} } }, "highlight" : { "type" : "annotated", "fields" : { "my_field" : {} } } }
- match: {_shards.failed: 0}
- do:
search:
request_cache: false
body: { "query" : {"match_phrase" : { "my_field" : {"query": "~MARK0", "analyzer": "whitespace"} } }, "highlight" : { "type" : "annotated", "fields" : { "my_field" : {} } } }
- match: {_shards.failed: 0}
- do:
search:
request_cache: false
body: { "query" : {"match_phrase" : { "my_field" : {"query": "~MARK0", "analyzer": "whitespace"} } }, "highlight" : { "type" : "annotated", "fields" : { "my_field" : {} } } }
- match: {_shards.failed: 0}
- do:
search:
request_cache: false
body: { "query" : {"match_phrase" : { "my_field" : {"query": "~MARK0", "analyzer": "whitespace"} } }, "highlight" : { "type" : "annotated", "fields" : { "my_field" : {} } } }
- match: {_shards.failed: 0}
- do:
search:
request_cache: false
body: { "query" : {"match_phrase" : { "my_field" : {"query": "~MARK0", "analyzer": "whitespace"} } }, "highlight" : { "type" : "annotated", "fields" : { "my_field" : {} } } }
- match: {_shards.failed: 0}
- do:
search:
request_cache: false
body: { "query" : {"match_phrase" : { "my_field" : {"query": "~MARK0", "analyzer": "whitespace"} } }, "highlight" : { "type" : "annotated", "fields" : { "my_field" : {} } } }
- match: {_shards.failed: 0}
- do:
search:
request_cache: false
body: { "query" : {"match_phrase" : { "my_field" : {"query": "~MARK0", "analyzer": "whitespace"} } }, "highlight" : { "type" : "annotated", "fields" : { "my_field" : {} } } }
- match: {_shards.failed: 0}

View File

@ -74,7 +74,6 @@ public interface FetchSubPhase {
}
return cache;
}
}
/**

View File

@ -38,6 +38,7 @@ import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
@ -68,12 +69,13 @@ public class UnifiedHighlighter implements Highlighter {
int numberOfFragments;
try {
final Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
final Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType,
hitContext);
List<Object> fieldValues = loadFieldValues(fieldType, field, context, hitContext);
if (fieldValues.size() == 0) {
return null;
}
final PassageFormatter passageFormatter = getPassageFormatter(field, encoder);
final PassageFormatter passageFormatter = getPassageFormatter(hitContext, field, encoder);
final IndexSearcher searcher = new IndexSearcher(hitContext.reader());
final CustomUnifiedHighlighter highlighter;
final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
@ -140,14 +142,14 @@ public class UnifiedHighlighter implements Highlighter {
return null;
}
protected PassageFormatter getPassageFormatter(SearchContextHighlight.Field field, Encoder encoder) {
protected PassageFormatter getPassageFormatter(HitContext hitContext, SearchContextHighlight.Field field, Encoder encoder) {
CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
field.fieldOptions().postTags()[0], encoder);
return passageFormatter;
}
protected Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
protected Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type, HitContext hitContext) {
return HighlightUtils.getAnalyzer(docMapper, type);
}