diff --git a/docs/plugins/mapper-annotated-text.asciidoc b/docs/plugins/mapper-annotated-text.asciidoc
new file mode 100644
index 00000000000..4528168a4d6
--- /dev/null
+++ b/docs/plugins/mapper-annotated-text.asciidoc
@@ -0,0 +1,328 @@
+[[mapper-annotated-text]]
+=== Mapper Annotated Text Plugin
+
+experimental[]
+
+The mapper-annotated-text plugin provides the ability to index text that is a
+combination of free-text and special markup that is typically used to identify
+items of interest such as people or organisations (see Named Entity Recognition
+(NER) tools).
+
+The Elasticsearch markup allows one or more additional tokens to be injected, unchanged, into the token
+stream at the same position as the underlying text it annotates.
+
+:plugin_name: mapper-annotated-text
+include::install_remove.asciidoc[]
+
+[[mapper-annotated-text-usage]]
+==== Using the `annotated-text` field
+
+The `annotated-text` field tokenizes text content as per the more common `text` field (see
+"limitations" below) but also injects any marked-up annotation tokens directly into
+the search index:
+
+[source,js]
+--------------------------
+PUT my_index
+{
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "my_field": {
+          "type": "annotated_text"
+        }
+      }
+    }
+  }
+}
+--------------------------
+// CONSOLE
+
+Such a mapping would allow marked-up text, e.g. Wikipedia articles, to be indexed as both text
+and structured tokens. The annotations use a markdown-like syntax using URL encoding of
+one or more values separated by the `&` symbol.
+
+We can use the `_analyze` API to test how an example annotation would be stored as tokens
+in the search index:
+
+[source,js]
+--------------------------
+GET my_index/_analyze
+{
+  "field": "my_field",
+  "text": "Investors in [Apple](Apple+Inc.) rejoiced."
+}
+--------------------------
+// NOTCONSOLE
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "investors",
+      "start_offset": 0,
+      "end_offset": 9,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "in",
+      "start_offset": 10,
+      "end_offset": 12,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "Apple Inc.", <1>
+      "start_offset": 13,
+      "end_offset": 18,
+      "type": "annotation",
+      "position": 2
+    },
+    {
+      "token": "apple",
+      "start_offset": 13,
+      "end_offset": 18,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "rejoiced",
+      "start_offset": 19,
+      "end_offset": 27,
+      "type": "<ALPHANUM>",
+      "position": 3
+    }
+  ]
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+<1> Note the whole annotation token `Apple Inc.` is placed, unchanged, as a single token in
+the token stream and at the same position (position 2) as the text token (`apple`) it annotates.
+
+We can now perform searches for annotations using regular `term` queries that don't tokenize
+the provided search values. Annotations are a more precise way of matching, as can be seen
+in this example where a search for `Beck` will not match `Jeff Beck`:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/_doc/1
+{
+  "my_field": "[Beck](Beck) announced a new tour" <1>
+}
+
+PUT my_index/_doc/2
+{
+  "my_field": "[Jeff Beck](Jeff+Beck&Guitarist) plays a strat" <2>
+}
+
+# Example search
+GET my_index/_search
+{
+  "query": {
+    "term": {
+      "my_field": "Beck" <3>
+    }
+  }
+}
+--------------------------
+// CONSOLE
+
+<1> As well as tokenising the plain text into single words e.g. `beck`, here we
+inject the single token value `Beck` at the same position as `beck` in the token stream.
+<2> Note annotations can inject multiple tokens at the same position - here we inject both
+the very specific value `Jeff Beck` and the broader term `Guitarist`. This enables
+broader positional queries, e.g. finding mentions of a `Guitarist` near to `strat`.
+<3> A benefit of searching with these carefully defined annotation tokens is that a query for
+`Beck` will not match document 2, which contains the tokens `jeff`, `beck` and `Jeff Beck`.
+
+WARNING: Any use of `=` signs in annotation values, e.g. `[Prince](person=Prince)`, will
+cause the document to be rejected with a parse failure. In the future we hope to find a use for
+the equals sign, so we actively reject documents that contain it today.
+
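+For callers generating this markup programmatically, a small client-side helper can
+take care of the URL encoding and of refusing values that would trip the parse
+failure described above. The following is a minimal sketch (the `annotate` helper
+is illustrative only and not part of the plugin):
+
+[source,java]
+--------------------------
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+
+public class AnnotationMarkup {
+
+    // Wraps the visible text in the [text](value1&value2...) syntax,
+    // URL-encoding each annotation value.
+    static String annotate(String text, String... values) throws UnsupportedEncodingException {
+        StringBuilder params = new StringBuilder();
+        for (String value : values) {
+            // The mapper rejects raw "=" signs in annotations, so refuse them up front
+            if (value.contains("=")) {
+                throw new IllegalArgumentException("'=' is not supported in annotation values");
+            }
+            if (params.length() > 0) {
+                params.append('&');
+            }
+            params.append(URLEncoder.encode(value, "UTF-8"));
+        }
+        return "[" + text + "](" + params + ")";
+    }
+
+    public static void main(String[] args) throws UnsupportedEncodingException {
+        // Prints: [Jeff Beck](Jeff+Beck&Guitarist)
+        System.out.println(annotate("Jeff Beck", "Jeff Beck", "Guitarist"));
+    }
+}
+--------------------------
+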
+[[mapper-annotated-text-tips]]
+==== Data modelling tips
+===== Use structured and unstructured fields
+
+Annotations are normally a way of weaving structured information into unstructured text for
+higher-precision search.
+
+`Entity resolution` is a form of document enrichment undertaken by specialist software or people
+where references to entities in a document are disambiguated by attaching a canonical ID.
+The ID is used to resolve any number of aliases or distinguish between people with the
+same name. The hyperlinks connecting Wikipedia's articles are a good example of resolved
+entity IDs woven into text.
+
+These IDs can be embedded as annotations in an annotated_text field, but it often makes
+sense to include them in dedicated structured fields to support discovery via aggregations:
+
+[source,js]
+--------------------------
+PUT my_index
+{
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "my_unstructured_text_field": {
+          "type": "annotated_text"
+        },
+        "my_twitter_handles": {
+          "type": "text",
+          "fields": {
+            "keyword": {
+              "type": "keyword"
+            }
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------
+// CONSOLE
+
+Applications would then typically provide content and discover it as follows:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/_doc/1
+{
+  "my_unstructured_text_field": "[Shay](%40kimchy) created elasticsearch",
+  "my_twitter_handles": ["@kimchy"] <1>
+}
+
+GET my_index/_search
+{
+  "query": {
+    "query_string": {
+        "query": "elasticsearch OR logstash OR kibana", <2>
+        "default_field": "my_unstructured_text_field"
+    }
+  },
+  "aggregations": {
+    "top_people": {
+      "significant_terms": { <3>
+        "field": "my_twitter_handles.keyword"
+      }
+    }
+  }
+}
+--------------------------
+// CONSOLE
+
+<1> Note the `my_twitter_handles` field contains a list of the annotation values
+also used in the unstructured text. (Note that the annotated_text syntax requires URL
+escaping of values, hence `%40kimchy` for `@kimchy`.) By repeating the annotation values
+in a structured field, this application has ensured that the tokens discovered in the
+structured field can be used for search and highlighting in the unstructured field.
+<2> In this example we search for documents that talk about components of the Elastic Stack.
+<3> We use the `my_twitter_handles` field here to discover people who are significantly
+associated with the Elastic Stack.
+
+===== Avoiding over-matching annotations
+By design, the regular text tokens and the annotation tokens co-exist in the same indexed
+field, but in rare cases this can lead to some over-matching.
+
+The value of an annotation often denotes a _named entity_ (a person, place or company).
+The tokens for these named entities are inserted untokenized, and differ from typical text
+tokens because they are normally:
+
+* Mixed case, e.g. `Jeff Beck`
+* Multiple words, e.g. `Jeff Beck`
+* Able to contain punctuation or numbers, e.g. `Apple Inc.` or `@kimchy`
+
+This means that, for the most part, a search for a named entity in the annotated text field
+will not have any false positives, e.g. when selecting `Apple Inc.` from an aggregation result
+you can drill down to highlight uses in the text without "over matching" on any text tokens
+like the word `apple` in this context:
+
+    the apple was very juicy
+
+However, a problem arises if your named entity happens to be a single term and lower-case, e.g. the
+company `elastic`. In this case, a search on the annotated text field for the token `elastic`
+may match a text document such as this:
+
+    he fired an elastic band
+
+To avoid such false matches, users should consider prefixing annotation values to ensure
+they don't clash with text tokens, e.g.
+
+    [elastic](Company_elastic) released version 7.0 of the elastic stack today
+
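+To see how a prefixed annotation is pulled apart at index time, the `AnnotatedText.parse`
+helper introduced by this plugin can be exercised directly. A rough sketch (this is an
+internal class, shown here purely for illustration and not as a supported API):
+
+[source,java]
+--------------------------
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
+
+public class ParseExample {
+    public static void main(String[] args) {
+        AnnotatedText parsed = AnnotatedText.parse(
+            "[elastic](Company_elastic) released version 7.0 of the elastic stack today");
+        // The plain text, with markup stripped, is what gets analyzed as ordinary text:
+        // "elastic released version 7.0 of the elastic stack today"
+        System.out.println(parsed.textMinusMarkup);
+        // One annotation token spans offsets 0-7 and carries the value "Company_elastic",
+        // so only searches for the prefixed token match the annotation:
+        System.out.println(parsed.getAnnotation(0)); // Company_elastic (0 - 7)
+    }
+}
+--------------------------
+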
+[[mapper-annotated-text-highlighter]]
+==== Using the `annotated` highlighter
+
+The `annotated-text` plugin includes a custom highlighter designed to mark up search hits
+in a way that respects the original markup:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/_doc/1
+{
+  "my_field": "The cat sat on the [mat](sku3578)"
+}
+
+GET my_index/_search
+{
+  "query": {
+    "query_string": {
+        "query": "cats"
+    }
+  },
+  "highlight": {
+    "fields": {
+      "my_field": {
+        "type": "annotated", <1>
+        "require_field_match": false
+      }
+    }
+  }
+}
+--------------------------
+// CONSOLE
+<1> The `annotated` highlighter type is designed for use with annotated_text fields.
+
+The annotated highlighter is based on the `unified` highlighter and supports the same
+settings but does not use the `pre_tags` or `post_tags` parameters. Rather than using
+HTML-like markup such as `<em>cat</em>`, the annotated highlighter uses the same
+markdown-like syntax used for annotations and injects a key=value annotation, where `_hit_term`
+is the key and the matched search term is the value, e.g.
+
+    The [cat](_hit_term=cat) sat on the [mat](sku3578)
+
+The annotated highlighter tries to be respectful of any existing markup in the original
+text:
+
+* If the search term matches exactly the location of an existing annotation, the
+`_hit_term` key is merged into the URL-like syntax used in the `(...)` part of the
+existing annotation.
+* However, if the search term overlaps the span of an existing annotation, it would break
+the markup formatting, so the original annotation is removed in favour of a new annotation
+containing just the search hit information in the results.
+* Any non-overlapping annotations in the original text are preserved in highlighter
+selections.
+
+[[mapper-annotated-text-limitations]]
+==== Limitations
+
+The annotated_text field type supports the same mapping settings as the `text` field type
+but with the following exceptions:
+
+* No support for `fielddata` or `fielddata_frequency_filter`
+* No support for `index_prefixes` or `index_phrases` indexing
+
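+Aside from these exceptions, the field behaves like a `text` field at analysis time. One
+way to confirm exactly which tokens, types and positions are produced (mirroring the
+earlier `_analyze` example) is to drive the plugin's analyzer wrapper directly. A minimal
+sketch, assuming the internal `AnnotationAnalyzerWrapper` class added by this plugin,
+which wraps Lucene's `StandardAnalyzer` by default:
+
+[source,java]
+--------------------------
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper;
+
+public class TokenDump {
+    public static void main(String[] args) throws Exception {
+        Analyzer analyzer = new AnnotationAnalyzerWrapper(); // wraps StandardAnalyzer
+        try (TokenStream ts = analyzer.tokenStream("f", "Investors in [Apple](Apple+Inc.) rejoiced.")) {
+            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
+            PositionIncrementAttribute inc = ts.addAttribute(PositionIncrementAttribute.class);
+            ts.reset();
+            int position = -1;
+            while (ts.incrementToken()) {
+                position += inc.getPositionIncrement();
+                // Expect "Apple Inc." and "apple" to share position 2,
+                // matching the _analyze response shown earlier
+                System.out.println(position + "  " + term);
+            }
+            ts.end();
+        }
+    }
+}
+--------------------------
+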
diff --git a/docs/plugins/mapper.asciidoc b/docs/plugins/mapper.asciidoc
index 226fc4e40d0..4026a45c59e 100644
--- a/docs/plugins/mapper.asciidoc
+++ b/docs/plugins/mapper.asciidoc
@@ -19,5 +19,13 @@ indexes the size in bytes of the original
 The mapper-murmur3 plugin allows hashes to be computed at index-time and stored
 in the index for later use with the `cardinality` aggregation.
 
+<<mapper-annotated-text>>::
+
+The annotated text plugin provides the ability to index text that is a
+combination of free-text and special markup that is typically used to identify
+items of interest such as people or organisations (see Named Entity Recognition
+(NER) tools).
+
 include::mapper-size.asciidoc[]
 include::mapper-murmur3.asciidoc[]
+include::mapper-annotated-text.asciidoc[]
diff --git a/docs/reference/cat/plugins.asciidoc b/docs/reference/cat/plugins.asciidoc
index a9915d7aaa2..9cb83321835 100644
--- a/docs/reference/cat/plugins.asciidoc
+++ b/docs/reference/cat/plugins.asciidoc
@@ -28,6 +28,7 @@ U7321H6 discovery-gce {version} The Google Compute Engine (GCE) Discov
 U7321H6 ingest-attachment {version} Ingest processor that uses Apache Tika to extract contents
 U7321H6 ingest-geoip {version} Ingest processor that uses looksup geo data based on ip adresses using the Maxmind geo database
 U7321H6 ingest-user-agent {version} Ingest processor that extracts information from a user agent
+U7321H6 mapper-annotated-text {version} The Mapper Annotated_text plugin adds support for text fields with markup used to inject annotation tokens into the index.
 U7321H6 mapper-murmur3 {version} The Mapper Murmur3 plugin allows to compute hashes of a field's values at index-time and to store them in the index.
 U7321H6 mapper-size {version} The Mapper Size plugin allows document to record their uncompressed size at index time.
 U7321H6 store-smb {version} The Store SMB plugin adds support for SMB stores.
diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc
index fbd8181d095..9cd55bee855 100644
--- a/docs/reference/mapping/types.asciidoc
+++ b/docs/reference/mapping/types.asciidoc
@@ -35,6 +35,7 @@ string:: <<text,`text`>> and <<keyword,`keyword`>>
 `completion` to provide auto-complete suggestions
 <<token-count>>:: `token_count` to count the number of tokens in a string
 {plugins}/mapper-murmur3.html[`mapper-murmur3`]:: `murmur3` to compute hashes of values at index-time and store them in the index
+{plugins}/mapper-annotated-text.html[`mapper-annotated-text`]:: `annotated-text` to index text containing special markup (typically used for identifying named entities)
 <<percolator>>:: Accepts queries from the query-dsl
diff --git a/plugins/mapper-annotated-text/build.gradle b/plugins/mapper-annotated-text/build.gradle
new file mode 100644
index 00000000000..8ce1ca2a416
--- /dev/null
+++ b/plugins/mapper-annotated-text/build.gradle
@@ -0,0 +1,23 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+esplugin {
+  description 'The Mapper Annotated_text plugin adds support for text fields with markup used to inject annotation tokens into the index.'
+ classname 'org.elasticsearch.plugin.mapper.AnnotatedTextPlugin' +} diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java new file mode 100644 index 00000000000..8cc38d130ff --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java @@ -0,0 +1,776 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.mapper.annotatedtext; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Analyzer.TokenStreamComponents; +import org.apache.lucene.analysis.AnalyzerWrapper; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.NormsFieldExistsQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.analysis.AnalyzerScope; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.FieldNamesFieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.mapper.StringFieldType; +import org.elasticsearch.index.mapper.TextFieldMapper; +import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken; +import 
org.elasticsearch.index.query.QueryShardContext; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.UncheckedIOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.elasticsearch.index.mapper.TypeParsers.parseTextField; + +/** A {@link FieldMapper} for full-text fields with annotation markup e.g. + * + * "New mayor is [John Smith](type=person&value=John%20Smith) " + * + * A special Analyzer wraps the default choice of analyzer in order + * to strip the text field of annotation markup and inject the related + * entity annotation tokens as supplementary tokens at the relevant points + * in the token stream. + * This code is largely a copy of TextFieldMapper which is less than ideal - + * my attempts to subclass TextFieldMapper failed but we can revisit this. + **/ +public class AnnotatedTextFieldMapper extends FieldMapper { + + public static final String CONTENT_TYPE = "annotated_text"; + private static final int POSITION_INCREMENT_GAP_USE_ANALYZER = -1; + + public static class Defaults { + public static final MappedFieldType FIELD_TYPE = new AnnotatedTextFieldType(); + static { + FIELD_TYPE.freeze(); + } + } + + public static class Builder extends FieldMapper.Builder { + + private int positionIncrementGap = POSITION_INCREMENT_GAP_USE_ANALYZER; + + public Builder(String name) { + super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); + builder = this; + } + + @Override + public AnnotatedTextFieldType fieldType() { + return (AnnotatedTextFieldType) super.fieldType(); + } + + public Builder positionIncrementGap(int positionIncrementGap) { + if (positionIncrementGap < 0) { + throw new MapperParsingException("[positions_increment_gap] must be positive, got " + positionIncrementGap); + } + this.positionIncrementGap = positionIncrementGap; + return this; + } + + @Override + public Builder docValues(boolean docValues) { + if (docValues) { + throw new IllegalArgumentException("[" + CONTENT_TYPE + "] fields do not support doc values"); + } + return super.docValues(docValues); + } + + @Override + public AnnotatedTextFieldMapper build(BuilderContext context) { + if (fieldType().indexOptions() == IndexOptions.NONE ) { + throw new IllegalArgumentException("[" + CONTENT_TYPE + "] fields must be indexed"); + } + if (positionIncrementGap != POSITION_INCREMENT_GAP_USE_ANALYZER) { + if (fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + throw new IllegalArgumentException("Cannot set position_increment_gap on field [" + + name + "] without positions enabled"); + } + fieldType.setIndexAnalyzer(new NamedAnalyzer(fieldType.indexAnalyzer(), positionIncrementGap)); + fieldType.setSearchAnalyzer(new NamedAnalyzer(fieldType.searchAnalyzer(), positionIncrementGap)); + fieldType.setSearchQuoteAnalyzer(new NamedAnalyzer(fieldType.searchQuoteAnalyzer(), positionIncrementGap)); + } else { + //Using the analyzer's default BUT need to do the same thing AnalysisRegistry.processAnalyzerFactory + // does to splice in new default of posIncGap=100 by wrapping the analyzer + if (fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { + int overrideInc = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP; + fieldType.setIndexAnalyzer(new 
NamedAnalyzer(fieldType.indexAnalyzer(), overrideInc));
+                fieldType.setSearchAnalyzer(new NamedAnalyzer(fieldType.searchAnalyzer(), overrideInc));
+                fieldType.setSearchQuoteAnalyzer(new NamedAnalyzer(fieldType.searchQuoteAnalyzer(), overrideInc));
+                }
+            }
+            setupFieldType(context);
+            return new AnnotatedTextFieldMapper(
+                    name, fieldType(), defaultFieldType, positionIncrementGap,
+                    context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
+        }
+    }
+
+    public static class TypeParser implements Mapper.TypeParser {
+        @Override
+        public Mapper.Builder parse(
+                String fieldName, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
+            AnnotatedTextFieldMapper.Builder builder = new AnnotatedTextFieldMapper.Builder(fieldName);
+
+            builder.fieldType().setIndexAnalyzer(parserContext.getIndexAnalyzers().getDefaultIndexAnalyzer());
+            builder.fieldType().setSearchAnalyzer(parserContext.getIndexAnalyzers().getDefaultSearchAnalyzer());
+            builder.fieldType().setSearchQuoteAnalyzer(parserContext.getIndexAnalyzers().getDefaultSearchQuoteAnalyzer());
+            parseTextField(builder, fieldName, node, parserContext);
+            for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
+                Map.Entry<String, Object> entry = iterator.next();
+                String propName = entry.getKey();
+                Object propNode = entry.getValue();
+                if (propName.equals("position_increment_gap")) {
+                    int newPositionIncrementGap = XContentMapValues.nodeIntegerValue(propNode, -1);
+                    builder.positionIncrementGap(newPositionIncrementGap);
+                    iterator.remove();
+                }
+            }
+            return builder;
+        }
+    }
+
+
+    /**
+     * Parses markdown-like syntax into plain text and AnnotationTokens with offsets for
+     * annotations found in texts
+     */
+    public static final class AnnotatedText {
+        public final String textPlusMarkup;
+        public final String textMinusMarkup;
+        List<AnnotationToken> annotations;
+
+        // Format is markdown-like syntax for URLs eg:
+        //   "New mayor is [John Smith](type=person&value=John%20Smith) "
+        static Pattern markdownPattern = Pattern.compile("\\[([^\\]\\[]*)\\]\\(([^\\)\\(]*)\\)");
+
+        public static AnnotatedText parse(String textPlusMarkup) {
+            List<AnnotationToken> annotations = new ArrayList<>();
+            Matcher m = markdownPattern.matcher(textPlusMarkup);
+            int lastPos = 0;
+            StringBuilder sb = new StringBuilder();
+            while(m.find()){
+                if(m.start() > lastPos){
+                    sb.append(textPlusMarkup.substring(lastPos, m.start()));
+                }
+
+                int startOffset = sb.length();
+                int endOffset = sb.length() + m.group(1).length();
+                sb.append(m.group(1));
+                lastPos = m.end();
+
+                String[] pairs = m.group(2).split("&");
+                String value = null;
+                for (String pair : pairs) {
+                    String[] kv = pair.split("=");
+                    try {
+                        if(kv.length == 2){
+                            throw new ElasticsearchParseException("key=value pairs are not supported in annotations");
+                        }
+                        if(kv.length == 1) {
+                            //Check "=" sign wasn't in the pair string
+                            if(kv[0].length() == pair.length()) {
+                                //untyped value
+                                value = URLDecoder.decode(kv[0], "UTF-8");
+                            }
+                        }
+                        if (value!=null && value.length() > 0) {
+                            annotations.add(new AnnotationToken(startOffset, endOffset, value));
+                        }
+                    } catch (UnsupportedEncodingException uee){
+                        throw new ElasticsearchParseException("Unsupported encoding parsing annotated text", uee);
+                    }
+                }
+            }
+            if(lastPos < textPlusMarkup.length()){
+                sb.append(textPlusMarkup.substring(lastPos));
+            }
+            return new AnnotatedText(sb.toString(), textPlusMarkup, annotations);
+        }
+
+        protected AnnotatedText(String textMinusMarkup, String textPlusMarkup, List<AnnotationToken> annotations) {
+            this.textMinusMarkup = textMinusMarkup;
+            this.textPlusMarkup =
textPlusMarkup; + this.annotations = annotations; + } + + public static final class AnnotationToken { + public final int offset; + public final int endOffset; + + public final String value; + public AnnotationToken(int offset, int endOffset, String value) { + this.offset = offset; + this.endOffset = endOffset; + this.value = value; + } + @Override + public String toString() { + return value +" ("+offset+" - "+endOffset+")"; + } + + public boolean intersects(int start, int end) { + return (start <= offset && end >= offset) || (start <= endOffset && end >= endOffset) + || (start >= offset && end <= endOffset); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + endOffset; + result = prime * result + offset; + result = prime * result + Objects.hashCode(value); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + AnnotationToken other = (AnnotationToken) obj; + return Objects.equals(endOffset, other.endOffset) && Objects.equals(offset, other.offset) + && Objects.equals(value, other.value); + } + + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(textMinusMarkup); + sb.append("\n"); + annotations.forEach(a -> {sb.append(a); sb.append("\n");}); + return sb.toString(); + } + + public int numAnnotations() { + return annotations.size(); + } + + public AnnotationToken getAnnotation(int index) { + return annotations.get(index); + } + } + + // A utility class for use with highlighters where the content being highlighted + // needs plain text format for highlighting but marked-up format for token discovery. + // The class takes markedup format field values and returns plain text versions. + // When asked to tokenize plain-text versions by the highlighter it tokenizes the + // original markup form in order to inject annotations. 
+ public static final class AnnotatedHighlighterAnalyzer extends AnalyzerWrapper { + private Analyzer delegate; + private AnnotatedText[] annotations; + public AnnotatedHighlighterAnalyzer(Analyzer delegate){ + super(delegate.getReuseStrategy()); + this.delegate = delegate; + } + + public void init(String[] markedUpFieldValues) { + this.annotations = new AnnotatedText[markedUpFieldValues.length]; + for (int i = 0; i < markedUpFieldValues.length; i++) { + annotations[i] = AnnotatedText.parse(markedUpFieldValues[i]); + } + } + + public String [] getPlainTextValuesForHighlighter(){ + String [] result = new String[annotations.length]; + for (int i = 0; i < annotations.length; i++) { + result[i] = annotations[i].textMinusMarkup; + } + return result; + } + + public AnnotationToken[] getIntersectingAnnotations(int start, int end) { + List intersectingAnnotations = new ArrayList<>(); + int fieldValueOffset =0; + for (AnnotatedText fieldValueAnnotations : this.annotations) { + //This is called from a highlighter where all of the field values are concatenated + // so each annotation offset will need to be adjusted so that it takes into account + // the previous values AND the MULTIVAL delimiter + for (AnnotationToken token : fieldValueAnnotations.annotations) { + if(token.intersects(start - fieldValueOffset , end - fieldValueOffset)) { + intersectingAnnotations.add(new AnnotationToken(token.offset + fieldValueOffset, + token.endOffset + fieldValueOffset, token.value)); + } + } + //add 1 for the fieldvalue separator character + fieldValueOffset +=fieldValueAnnotations.textMinusMarkup.length() +1; + } + return intersectingAnnotations.toArray(new AnnotationToken[intersectingAnnotations.size()]); + } + + @Override + public Analyzer getWrappedAnalyzer(String fieldName) { + return delegate; + } + + @Override + protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { + if(components instanceof AnnotatedHighlighterTokenStreamComponents){ + // already wrapped. + return components; + } + AnnotationsInjector injector = new AnnotationsInjector(components.getTokenStream()); + return new AnnotatedHighlighterTokenStreamComponents(components.getTokenizer(), injector, this.annotations); + } + } + private static final class AnnotatedHighlighterTokenStreamComponents extends TokenStreamComponents{ + + private AnnotationsInjector annotationsInjector; + private AnnotatedText[] annotations; + int readerNum = 0; + + AnnotatedHighlighterTokenStreamComponents(Tokenizer source, AnnotationsInjector annotationsFilter, + AnnotatedText[] annotations) { + super(source, annotationsFilter); + this.annotationsInjector = annotationsFilter; + this.annotations = annotations; + } + + @Override + protected void setReader(Reader reader) { + String plainText = readToString(reader); + AnnotatedText at = this.annotations[readerNum++]; + assert at.textMinusMarkup.equals(plainText); + // This code is reliant on the behaviour of highlighter logic - it + // takes plain text multi-value fields and then calls the same analyzer + // for each field value in turn. 
This class has cached the annotations + // associated with each plain-text value and are arranged in the same order + annotationsInjector.setAnnotations(at); + super.setReader(new StringReader(at.textMinusMarkup)); + } + + } + + + public static final class AnnotationAnalyzerWrapper extends AnalyzerWrapper { + + + private final Analyzer delegate; + + public AnnotationAnalyzerWrapper (Analyzer delegate) { + super(delegate.getReuseStrategy()); + this.delegate = delegate; + } + + /** + * Wraps {@link StandardAnalyzer}. + */ + public AnnotationAnalyzerWrapper() { + this(new StandardAnalyzer()); + } + + + @Override + public Analyzer getWrappedAnalyzer(String fieldName) { + return delegate; + } + + @Override + protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { + if(components instanceof AnnotatedTokenStreamComponents){ + // already wrapped. + return components; + } + AnnotationsInjector injector = new AnnotationsInjector(components.getTokenStream()); + return new AnnotatedTokenStreamComponents(components.getTokenizer(), injector); + } + } + + + //This Analyzer is not "wrappable" because of a limitation in Lucene https://issues.apache.org/jira/browse/LUCENE-8352 + private static final class AnnotatedTokenStreamComponents extends TokenStreamComponents{ + private AnnotationsInjector annotationsInjector; + + AnnotatedTokenStreamComponents(Tokenizer source, AnnotationsInjector annotationsInjector) { + super(source, annotationsInjector); + this.annotationsInjector = annotationsInjector; + } + + @Override + protected void setReader(Reader reader) { + // Sneaky code to change the content downstream components will parse. + // Replace the marked-up content Reader with a plain text Reader and prime the + // annotations injector with the AnnotatedTokens that need to be injected + // as plain-text parsing progresses. 
+ AnnotatedText annotations = AnnotatedText.parse(readToString(reader)); + annotationsInjector.setAnnotations(annotations); + super.setReader(new StringReader(annotations.textMinusMarkup)); + } + } + + static String readToString(Reader reader) { + char[] arr = new char[8 * 1024]; + StringBuilder buffer = new StringBuilder(); + int numCharsRead; + try { + while ((numCharsRead = reader.read(arr, 0, arr.length)) != -1) { + buffer.append(arr, 0, numCharsRead); + } + reader.close(); + return buffer.toString(); + } catch (IOException e) { + throw new UncheckedIOException("IO Error reading field content", e); + } + } + + + public static final class AnnotationsInjector extends TokenFilter { + + private AnnotatedText annotatedText; + AnnotatedText.AnnotationToken nextAnnotationForInjection = null; + private int currentAnnotationIndex = 0; + List pendingStates = new ArrayList<>(); + int pendingStatePos = 0; + boolean inputExhausted = false; + + private final OffsetAttribute textOffsetAtt = addAttribute(OffsetAttribute.class); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + public AnnotationsInjector(TokenStream in) { + super(in); + } + + public void setAnnotations(AnnotatedText annotatedText) { + this.annotatedText = annotatedText; + currentAnnotationIndex = 0; + if(annotatedText!=null && annotatedText.numAnnotations()>0){ + nextAnnotationForInjection = annotatedText.getAnnotation(0); + } else { + nextAnnotationForInjection = null; + } + } + + + + @Override + public void reset() throws IOException { + pendingStates.clear(); + pendingStatePos = 0; + inputExhausted = false; + super.reset(); + } + + // Abstracts if we are pulling from some pre-cached buffer of + // text tokens or directly from the wrapped TokenStream + private boolean internalNextToken() throws IOException{ + if (pendingStatePos < pendingStates.size()){ + restoreState(pendingStates.get(pendingStatePos)); + pendingStatePos ++; + if(pendingStatePos >=pendingStates.size()){ + pendingStatePos =0; + pendingStates.clear(); + } + return true; + } + if(inputExhausted) { + return false; + } + return input.incrementToken(); + } + + @Override + public boolean incrementToken() throws IOException { + if (internalNextToken()) { + if (nextAnnotationForInjection != null) { + // If we are at the right point to inject an annotation.... + if (textOffsetAtt.startOffset() >= nextAnnotationForInjection.offset) { + int firstSpannedTextPosInc = posAtt.getPositionIncrement(); + int annotationPosLen = 1; + + // Capture the text token's state for later replay - but + // with a zero pos increment so is same as annotation + // that is injected before it + posAtt.setPositionIncrement(0); + pendingStates.add(captureState()); + + while (textOffsetAtt.endOffset() <= nextAnnotationForInjection.endOffset) { + // Buffer up all the other tokens spanned by this annotation to determine length. 
+ if (input.incrementToken()) { + if (textOffsetAtt.endOffset() <= nextAnnotationForInjection.endOffset + && textOffsetAtt.startOffset() < nextAnnotationForInjection.endOffset) { + annotationPosLen += posAtt.getPositionIncrement(); + } + pendingStates.add(captureState()); + } else { + inputExhausted = true; + break; + } + } + emitAnnotation(firstSpannedTextPosInc, annotationPosLen); + return true; + } + } + return true; + } else { + inputExhausted = true; + return false; + } + } + private void setType(AnnotationToken token) { + //Default annotation type - in future AnnotationTokens may contain custom type info + typeAtt.setType("annotation"); + } + + private void emitAnnotation(int firstSpannedTextPosInc, int annotationPosLen) throws IOException { + // Set the annotation's attributes + posLenAtt.setPositionLength(annotationPosLen); + textOffsetAtt.setOffset(nextAnnotationForInjection.offset, nextAnnotationForInjection.endOffset); + setType(nextAnnotationForInjection); + + // We may have multiple annotations at this location - stack them up + final int annotationOffset = nextAnnotationForInjection.offset; + final AnnotatedText.AnnotationToken firstAnnotationAtThisPos = nextAnnotationForInjection; + while (nextAnnotationForInjection != null && nextAnnotationForInjection.offset == annotationOffset) { + + + setType(nextAnnotationForInjection); + termAtt.resizeBuffer(nextAnnotationForInjection.value.length()); + termAtt.copyBuffer(nextAnnotationForInjection.value.toCharArray(), 0, nextAnnotationForInjection.value.length()); + + if (nextAnnotationForInjection == firstAnnotationAtThisPos) { + posAtt.setPositionIncrement(firstSpannedTextPosInc); + //Put at the head of the queue of tokens to be emitted + pendingStates.add(0, captureState()); + } else { + posAtt.setPositionIncrement(0); + //Put after the head of the queue of tokens to be emitted + pendingStates.add(1, captureState()); + } + + + // Flag the inject annotation as null to prevent re-injection. 
+ currentAnnotationIndex++; + if (currentAnnotationIndex < annotatedText.numAnnotations()) { + nextAnnotationForInjection = annotatedText.getAnnotation(currentAnnotationIndex); + } else { + nextAnnotationForInjection = null; + } + } + // Now pop the first of many potential buffered tokens: + internalNextToken(); + } + + } + + + public static final class AnnotatedTextFieldType extends StringFieldType { + + public AnnotatedTextFieldType() { + setTokenized(true); + } + + protected AnnotatedTextFieldType(AnnotatedTextFieldType ref) { + super(ref); + } + + @Override + public void setIndexAnalyzer(NamedAnalyzer delegate) { + if(delegate.analyzer() instanceof AnnotationAnalyzerWrapper){ + // Already wrapped the Analyzer with an AnnotationAnalyzer + super.setIndexAnalyzer(delegate); + } else { + // Wrap the analyzer with an AnnotationAnalyzer that will inject required annotations + super.setIndexAnalyzer(new NamedAnalyzer(delegate.name(), AnalyzerScope.INDEX, + new AnnotationAnalyzerWrapper(delegate.analyzer()))); + } + } + + public AnnotatedTextFieldType clone() { + return new AnnotatedTextFieldType(this); + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public Query existsQuery(QueryShardContext context) { + if (omitNorms()) { + return new TermQuery(new Term(FieldNamesFieldMapper.NAME, name())); + } else { + return new NormsFieldExistsQuery(name()); + } + } + + @Override + public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePosIncrements) throws IOException { + PhraseQuery.Builder builder = new PhraseQuery.Builder(); + builder.setSlop(slop); + + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); + int position = -1; + + stream.reset(); + while (stream.incrementToken()) { + if (enablePosIncrements) { + position += posIncrAtt.getPositionIncrement(); + } + else { + position += 1; + } + builder.add(new Term(field, termAtt.getBytesRef()), position); + } + + return builder.build(); + } + + @Override + public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException { + + MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder(); + mpqb.setSlop(slop); + + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + + PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); + int position = -1; + + List multiTerms = new ArrayList<>(); + stream.reset(); + while (stream.incrementToken()) { + int positionIncrement = posIncrAtt.getPositionIncrement(); + + if (positionIncrement > 0 && multiTerms.size() > 0) { + if (enablePositionIncrements) { + mpqb.add(multiTerms.toArray(new Term[0]), position); + } else { + mpqb.add(multiTerms.toArray(new Term[0])); + } + multiTerms.clear(); + } + position += positionIncrement; + multiTerms.add(new Term(field, termAtt.getBytesRef())); + } + + if (enablePositionIncrements) { + mpqb.add(multiTerms.toArray(new Term[0]), position); + } else { + mpqb.add(multiTerms.toArray(new Term[0])); + } + return mpqb.build(); + } + } + + private int positionIncrementGap; + protected AnnotatedTextFieldMapper(String simpleName, AnnotatedTextFieldType fieldType, MappedFieldType defaultFieldType, + int positionIncrementGap, + Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + super(simpleName, fieldType, defaultFieldType, indexSettings, 
multiFields, copyTo); + assert fieldType.tokenized(); + assert fieldType.hasDocValues() == false; + this.positionIncrementGap = positionIncrementGap; + } + + @Override + protected AnnotatedTextFieldMapper clone() { + return (AnnotatedTextFieldMapper) super.clone(); + } + + public int getPositionIncrementGap() { + return this.positionIncrementGap; + } + + @Override + protected void parseCreateField(ParseContext context, List fields) throws IOException { + final String value; + if (context.externalValueSet()) { + value = context.externalValue().toString(); + } else { + value = context.parser().textOrNull(); + } + + if (value == null) { + return; + } + + if (fieldType().indexOptions() != IndexOptions.NONE || fieldType().stored()) { + Field field = new Field(fieldType().name(), value, fieldType()); + fields.add(field); + if (fieldType().omitNorms()) { + createFieldNamesField(context, fields); + } + } + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + @Override + public AnnotatedTextFieldType fieldType() { + return (AnnotatedTextFieldType) super.fieldType(); + } + + @Override + protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException { + super.doXContentBody(builder, includeDefaults, params); + doXContentAnalyzers(builder, includeDefaults); + + if (includeDefaults || positionIncrementGap != POSITION_INCREMENT_GAP_USE_ANALYZER) { + builder.field("position_increment_gap", positionIncrementGap); + } + } +} diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/plugin/mapper/AnnotatedTextPlugin.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/plugin/mapper/AnnotatedTextPlugin.java new file mode 100644 index 00000000000..c7abe5fb5f9 --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/plugin/mapper/AnnotatedTextPlugin.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.elasticsearch.plugin.mapper;
+
+import java.util.Collections;
+import java.util.Map;
+
+import org.elasticsearch.index.mapper.Mapper;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper;
+import org.elasticsearch.plugins.MapperPlugin;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.plugins.SearchPlugin;
+import org.elasticsearch.search.fetch.subphase.highlight.AnnotatedTextHighlighter;
+import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
+
+public class AnnotatedTextPlugin extends Plugin implements MapperPlugin, SearchPlugin {
+
+    @Override
+    public Map<String, Mapper.TypeParser> getMappers() {
+        return Collections.singletonMap(AnnotatedTextFieldMapper.CONTENT_TYPE, new AnnotatedTextFieldMapper.TypeParser());
+    }
+
+    @Override
+    public Map<String, Highlighter> getHighlighters() {
+        return Collections.singletonMap(AnnotatedTextHighlighter.NAME, new AnnotatedTextHighlighter());
+    }
+}
diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java
new file mode 100644
index 00000000000..ad1acc85031
--- /dev/null
+++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.search.highlight.Encoder; +import org.apache.lucene.search.uhighlight.Passage; +import org.apache.lucene.search.uhighlight.PassageFormatter; +import org.apache.lucene.search.uhighlight.Snippet; +import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer; +import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken; + +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +/** + * Custom passage formatter that : + * 1) marks up search hits in markdown-like syntax for URLs ({@link Snippet}) + * 2) injects any annotations from the original text that don't conflict with search hit highlighting + */ +public class AnnotatedPassageFormatter extends PassageFormatter { + + + public static final String SEARCH_HIT_TYPE = "_hit_term"; + private final Encoder encoder; + private AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer; + + public AnnotatedPassageFormatter(AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer, Encoder encoder) { + this.annotatedHighlighterAnalyzer = annotatedHighlighterAnalyzer; + this.encoder = encoder; + } + + static class MarkupPassage { + List markups = new ArrayList<>(); + int lastMarkupEnd = -1; + + public void addUnlessOverlapping(Markup newMarkup) { + + // Fast exit. + if(newMarkup.start > lastMarkupEnd) { + markups.add(newMarkup); + lastMarkupEnd = newMarkup.end; + return; + } + + // Check to see if this new markup overlaps with any prior + int index=0; + for (Markup existingMarkup: markups) { + if(existingMarkup.samePosition(newMarkup)) { + existingMarkup.merge(newMarkup); + return; + } + if(existingMarkup.overlaps(newMarkup)) { + // existing markup wins - we throw away the new markup that would span this position + return; + } + // markup list is in start offset order so we can insert at this position then shift others right + if(existingMarkup.isAfter(newMarkup)) { + markups.add(index, newMarkup); + return; + } + index++; + } + markups.add(newMarkup); + lastMarkupEnd = newMarkup.end; + } + + } + static class Markup { + int start; + int end; + String metadata; + Markup(int start, int end, String metadata) { + super(); + this.start = start; + this.end = end; + this.metadata = metadata; + } + boolean isAfter(Markup other) { + return start > other.end; + } + void merge(Markup newMarkup) { + // metadata is key1=value&key2=value&.... 
syntax used for urls + assert samePosition(newMarkup); + metadata += "&" + newMarkup.metadata; + } + boolean samePosition(Markup other) { + return this.start == other.start && this.end == other.end; + } + boolean overlaps(Markup other) { + return (start<=other.start && end >= other.start) + || (start <= other.end && end >=other.end) + || (start>=other.start && end<=other.end); + } + @Override + public String toString() { + return "Markup [start=" + start + ", end=" + end + ", metadata=" + metadata + "]"; + } + + + } + // Merge original annotations and search hits into a single set of markups for each passage + static MarkupPassage mergeAnnotations(AnnotationToken [] annotations, Passage passage){ + try { + MarkupPassage markupPassage = new MarkupPassage(); + + // Add search hits first - they take precedence over any other markup + for (int i = 0; i < passage.getNumMatches(); i++) { + int start = passage.getMatchStarts()[i]; + int end = passage.getMatchEnds()[i]; + String searchTerm = passage.getMatchTerms()[i].utf8ToString(); + Markup markup = new Markup(start, end, SEARCH_HIT_TYPE+"="+URLEncoder.encode(searchTerm, StandardCharsets.UTF_8.name())); + markupPassage.addUnlessOverlapping(markup); + } + + // Now add original text's annotations - ignoring any that might conflict with the search hits markup. + for (AnnotationToken token: annotations) { + int start = token.offset; + int end = token.endOffset; + if(start >= passage.getStartOffset() && end<=passage.getEndOffset()) { + String escapedValue = URLEncoder.encode(token.value, StandardCharsets.UTF_8.name()); + Markup markup = new Markup(start, end, escapedValue); + markupPassage.addUnlessOverlapping(markup); + } + } + return markupPassage; + + } catch (UnsupportedEncodingException e) { + // We should always have UTF-8 support + throw new IllegalStateException(e); + } + } + + + @Override + public Snippet[] format(Passage[] passages, String content) { + Snippet[] snippets = new Snippet[passages.length]; + + int pos; + int j = 0; + for (Passage passage : passages) { + AnnotationToken [] annotations = annotatedHighlighterAnalyzer.getIntersectingAnnotations(passage.getStartOffset(), + passage.getEndOffset()); + MarkupPassage mergedMarkup = mergeAnnotations(annotations, passage); + + StringBuilder sb = new StringBuilder(); + pos = passage.getStartOffset(); + for(Markup markup: mergedMarkup.markups) { + int start = markup.start; + int end = markup.end; + // its possible to have overlapping terms + if (start > pos) { + append(sb, content, pos, start); + } + if (end > pos) { + sb.append("["); + append(sb, content, Math.max(pos, start), end); + + sb.append("]("); + sb.append(markup.metadata); + sb.append(")"); + pos = end; + } + } + // its possible a "term" from the analyzer could span a sentence boundary. 
+ append(sb, content, pos, Math.max(pos, passage.getEndOffset())); + //we remove the paragraph separator if present at the end of the snippet (we used it as separator between values) + if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) { + sb.deleteCharAt(sb.length() - 1); + } else if (sb.charAt(sb.length() - 1) == HighlightUtils.NULL_SEPARATOR) { + sb.deleteCharAt(sb.length() - 1); + } + //and we trim the snippets too + snippets[j++] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0); + } + return snippets; + } + + private void append(StringBuilder dest, String content, int start, int end) { + dest.append(encoder.encodeText(content.substring(start, end))); + } +} diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java new file mode 100644 index 00000000000..d93316c7892 --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java @@ -0,0 +1,64 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.elasticsearch.search.fetch.subphase.highlight;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.highlight.Encoder;
+import org.apache.lucene.search.uhighlight.PassageFormatter;
+import org.elasticsearch.index.mapper.DocumentMapper;
+import org.elasticsearch.index.mapper.MappedFieldType;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
+import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
+import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.Field;
+import org.elasticsearch.search.internal.SearchContext;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+public class AnnotatedTextHighlighter extends UnifiedHighlighter {
+
+    public static final String NAME = "annotated";
+
+    AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer = null;
+
+    @Override
+    protected Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
+        annotatedHighlighterAnalyzer = new AnnotatedHighlighterAnalyzer(super.getAnalyzer(docMapper, type));
+        return annotatedHighlighterAnalyzer;
+    }
+
+    // Convert the marked-up values held on-disk to plain-text versions for highlighting
+    @Override
+    protected List<Object> loadFieldValues(MappedFieldType fieldType, Field field, SearchContext context, HitContext hitContext)
+            throws IOException {
+        List<Object> fieldValues = super.loadFieldValues(fieldType, field, context, hitContext);
+        String[] fieldValuesAsString = fieldValues.toArray(new String[fieldValues.size()]);
+        annotatedHighlighterAnalyzer.init(fieldValuesAsString);
+        return Arrays.asList((Object[]) annotatedHighlighterAnalyzer.getPlainTextValuesForHighlighter());
+    }
+
+    @Override
+    protected PassageFormatter getPassageFormatter(SearchContextHighlight.Field field, Encoder encoder) {
+        return new AnnotatedPassageFormatter(annotatedHighlighterAnalyzer, encoder);
+    }
+
+}
diff --git a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextClientYamlTestSuiteIT.java b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextClientYamlTestSuiteIT.java
new file mode 100644
index 00000000000..3d643b2a7ca
--- /dev/null
+++ b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextClientYamlTestSuiteIT.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.annotatedtext;
+
+import com.carrotsearch.randomizedtesting.annotations.Name;
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+
+import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
+import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
+
+public class AnnotatedTextClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
+
+    public AnnotatedTextClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
+        super(testCandidate);
+    }
+
+    @ParametersFactory
+    public static Iterable<Object[]> parameters() throws Exception {
+        return createParameters();
+    }
+}
+
diff --git a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java
new file mode 100644
index 00000000000..8a51b9a494b
--- /dev/null
+++ b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java
@@ -0,0 +1,681 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.annotatedtext;
+
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.IndexableFieldType;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.termvectors.TermVectorsRequest;
+import org.elasticsearch.action.termvectors.TermVectorsResponse;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.common.compress.CompressedXContent;
+import org.elasticsearch.common.lucene.uid.Versions;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.ToXContent;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.common.xcontent.XContentType;
+import org.elasticsearch.index.IndexService;
+import org.elasticsearch.index.VersionType;
+import org.elasticsearch.index.engine.Engine;
+import org.elasticsearch.index.mapper.DocumentMapper;
+import org.elasticsearch.index.mapper.DocumentMapperParser;
+import org.elasticsearch.index.mapper.MapperParsingException;
+import org.elasticsearch.index.mapper.MapperService.MergeReason;
+import org.elasticsearch.index.mapper.ParsedDocument;
+import org.elasticsearch.index.mapper.SourceToParse;
+import org.elasticsearch.index.mapper.TextFieldMapper;
+import org.elasticsearch.index.shard.IndexShard;
+import org.elasticsearch.index.termvectors.TermVectorsService;
+import org.elasticsearch.indices.IndicesService;
+import org.elasticsearch.plugin.mapper.AnnotatedTextPlugin;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+import org.junit.Before;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.notNullValue;
+
+public class AnnotatedTextFieldMapperTests extends ESSingleNodeTestCase {
+
+    IndexService indexService;
+    DocumentMapperParser parser;
+
+    @Before
+    public void setup() {
+        Settings settings = Settings.builder()
+                .put("index.analysis.filter.mySynonyms.type", "synonym")
+                .putList("index.analysis.filter.mySynonyms.synonyms", Collections.singletonList("car, auto"))
+                .put("index.analysis.analyzer.synonym.tokenizer", "standard")
+                .put("index.analysis.analyzer.synonym.filter", "mySynonyms")
+                // Stop filter remains in server as it is part of lucene-core
+                .put("index.analysis.analyzer.my_stop_analyzer.tokenizer", "standard")
+                .put("index.analysis.analyzer.my_stop_analyzer.filter", "stop")
+                .build();
+        indexService = createIndex("test", settings);
+        parser = indexService.mapperService().documentMapperParser();
+    }
+
+    @Override
+    protected Collection<Class<? extends Plugin>> getPlugins() {
+        List<Class<? extends Plugin>> classpathPlugins = new ArrayList<>();
+        classpathPlugins.add(AnnotatedTextPlugin.class);
+        return classpathPlugins;
+    }
+
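+    // The field type under test; the tests copied from TextFieldMapperTests below run against annotated_text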
+    protected String getFieldType() {
+        return "annotated_text";
+    }
+
+    public void testAnnotationInjection() throws IOException {
+
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = indexService.mapperService().merge("type",
+                new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);
+
+        // Use example of typed and untyped annotations
+        String annotatedText = "He paid [Stormy Daniels](Stephanie+Clifford&Payee) hush money";
+        SourceToParse sourceToParse = SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", annotatedText)
+                        .endObject()),
+                XContentType.JSON);
+        ParsedDocument doc = mapper.parse(sourceToParse);
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+
+        assertEquals(annotatedText, fields[0].stringValue());
+
+        IndexShard shard = indexService.getShard(0);
+        shard.applyIndexOperationOnPrimary(Versions.MATCH_ANY, VersionType.INTERNAL,
+                sourceToParse, IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, false);
+        shard.refresh("test");
+        try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
+            LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
+            TermsEnum terms = leaf.terms("field").iterator();
+
+            assertTrue(terms.seekExact(new BytesRef("stormy")));
+            PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(2, postings.nextPosition());
+
+            assertTrue(terms.seekExact(new BytesRef("Stephanie Clifford")));
+            postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(2, postings.nextPosition());
+
+            assertTrue(terms.seekExact(new BytesRef("Payee")));
+            postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(2, postings.nextPosition());
+
+            assertTrue(terms.seekExact(new BytesRef("hush")));
+            postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(4, postings.nextPosition());
+        }
+    }
+
+    public void testToleranceForBadAnnotationMarkup() throws IOException {
+
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = indexService.mapperService().merge("type",
+                new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);
+
+        String annotatedText = "foo [bar](MissingEndBracket baz";
+        SourceToParse sourceToParse = SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", annotatedText)
+                        .endObject()),
+                XContentType.JSON);
+        ParsedDocument doc = mapper.parse(sourceToParse);
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+
+        assertEquals(annotatedText, fields[0].stringValue());
+
+        IndexShard shard = indexService.getShard(0);
+        shard.applyIndexOperationOnPrimary(Versions.MATCH_ANY, VersionType.INTERNAL,
+                sourceToParse, IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, false);
+        shard.refresh("test");
+        try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
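+            // Inspect the indexed tokens directly via the shard's leaf reader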
+            LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
+            TermsEnum terms = leaf.terms("field").iterator();
+
+            assertTrue(terms.seekExact(new BytesRef("foo")));
+            PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(0, postings.nextPosition());
+
+            assertTrue(terms.seekExact(new BytesRef("bar")));
+            postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(1, postings.nextPosition());
+
+            assertFalse(terms.seekExact(new BytesRef("MissingEndBracket")));
+            // Bad markup means value is treated as plain text and fed through tokenisation
+            assertTrue(terms.seekExact(new BytesRef("missingendbracket")));
+        }
+    }
+
+    public void testAgainstTermVectorsAPI() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("tvfield").field("type", getFieldType())
+                .field("term_vector", "with_positions_offsets_payloads")
+                .endObject().endObject()
+                .endObject().endObject());
+        indexService.mapperService().merge("type", new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);
+
+        int max = between(3, 10);
+        BulkRequestBuilder bulk = client().prepareBulk();
+        for (int i = 0; i < max; i++) {
+            bulk.add(client().prepareIndex("test", "type", Integer.toString(i))
+                    .setSource("tvfield", "the quick [brown](Color) fox jumped over the lazy dog"));
+        }
+        bulk.get();
+
+        TermVectorsRequest request = new TermVectorsRequest("test", "type", "0").termStatistics(true);
+
+        IndicesService indicesService = getInstanceFromNode(IndicesService.class);
+        IndexService test = indicesService.indexService(resolveIndex("test"));
+        IndexShard shard = test.getShardOrNull(0);
+        assertThat(shard, notNullValue());
+        TermVectorsResponse response = TermVectorsService.getTermVectors(shard, request);
+        assertEquals(1, response.getFields().size());
+
+        Terms terms = response.getFields().terms("tvfield");
+        TermsEnum iterator = terms.iterator();
+        BytesRef term;
+        Set<String> foundTerms = new HashSet<>();
+        while ((term = iterator.next()) != null) {
+            foundTerms.add(term.utf8ToString());
+        }
+        // Check we have both text and annotation tokens
+        assertTrue(foundTerms.contains("brown"));
+        assertTrue(foundTerms.contains("Color"));
+        assertTrue(foundTerms.contains("fox"));
+    }
+
+    // ===== Code below copied from TextFieldMapperTests ========
+
+    public void testDefaults() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", "1234")
+                        .endObject()),
+                XContentType.JSON));
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+
+        assertEquals("1234", fields[0].stringValue());
+        IndexableFieldType fieldType = fields[0].fieldType();
+        assertThat(fieldType.omitNorms(), equalTo(false));
+        assertTrue(fieldType.tokenized());
+        assertFalse(fieldType.stored());
+        assertThat(fieldType.indexOptions(), equalTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS));
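+        // Term vectors and doc values are disabled by default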
+        assertThat(fieldType.storeTermVectors(), equalTo(false));
+        assertThat(fieldType.storeTermVectorOffsets(), equalTo(false));
+        assertThat(fieldType.storeTermVectorPositions(), equalTo(false));
+        assertThat(fieldType.storeTermVectorPayloads(), equalTo(false));
+        assertEquals(DocValuesType.NONE, fieldType.docValuesType());
+    }
+
+    public void testEnableStore() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).field("store", true).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", "1234")
+                        .endObject()),
+                XContentType.JSON));
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+        assertTrue(fields[0].fieldType().stored());
+    }
+
+    public void testDisableNorms() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field")
+                .field("type", getFieldType())
+                .field("norms", false)
+                .endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", "1234")
+                        .endObject()),
+                XContentType.JSON));
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+        assertTrue(fields[0].fieldType().omitNorms());
+    }
+
+    public void testIndexOptions() throws IOException {
+        Map<String, IndexOptions> supportedOptions = new HashMap<>();
+        supportedOptions.put("docs", IndexOptions.DOCS);
+        supportedOptions.put("freqs", IndexOptions.DOCS_AND_FREQS);
+        supportedOptions.put("positions", IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+        supportedOptions.put("offsets", IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+        XContentBuilder mappingBuilder = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties");
+        for (String option : supportedOptions.keySet()) {
+            mappingBuilder.startObject(option).field("type", getFieldType()).field("index_options", option).endObject();
+        }
+        String mapping = Strings.toString(mappingBuilder.endObject().endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        XContentBuilder jsonDoc = XContentFactory.jsonBuilder().startObject();
+        for (String option : supportedOptions.keySet()) {
+            jsonDoc.field(option, "1234");
+        }
+        ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference.bytes(jsonDoc.endObject()),
+                XContentType.JSON));
+
+        for (Map.Entry<String, IndexOptions> entry : supportedOptions.entrySet()) {
+            String field = entry.getKey();
+            IndexOptions options = entry.getValue();
+            IndexableField[] fields = doc.rootDoc().getFields(field);
+            assertEquals(1, fields.length);
+            assertEquals(options, fields[0].fieldType().indexOptions());
+        }
+    }
+
+    public void testDefaultPositionIncrementGap() throws IOException {
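+        // Values of a multi-valued field should be separated by the default position increment gap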
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = indexService.mapperService().merge("type",
+                new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        SourceToParse sourceToParse = SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .array("field", new String[] {"a", "b"})
+                        .endObject()),
+                XContentType.JSON);
+        ParsedDocument doc = mapper.parse(sourceToParse);
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(2, fields.length);
+
+        assertEquals("a", fields[0].stringValue());
+        assertEquals("b", fields[1].stringValue());
+
+        IndexShard shard = indexService.getShard(0);
+        shard.applyIndexOperationOnPrimary(Versions.MATCH_ANY, VersionType.INTERNAL,
+                sourceToParse, IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, false);
+        shard.refresh("test");
+        try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
+            LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
+            TermsEnum terms = leaf.terms("field").iterator();
+            assertTrue(terms.seekExact(new BytesRef("b")));
+            PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(TextFieldMapper.Defaults.POSITION_INCREMENT_GAP + 1, postings.nextPosition());
+        }
+    }
+
+    public void testPositionIncrementGap() throws IOException {
+        final int positionIncrementGap = randomIntBetween(1, 1000);
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field")
+                .field("type", getFieldType())
+                .field("position_increment_gap", positionIncrementGap)
+                .endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = indexService.mapperService().merge("type",
+                new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        SourceToParse sourceToParse = SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .array("field", new String[]{"a", "b"})
+                        .endObject()),
+                XContentType.JSON);
+        ParsedDocument doc = mapper.parse(sourceToParse);
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(2, fields.length);
+
+        assertEquals("a", fields[0].stringValue());
+        assertEquals("b", fields[1].stringValue());
+
+        IndexShard shard = indexService.getShard(0);
+        shard.applyIndexOperationOnPrimary(Versions.MATCH_ANY, VersionType.INTERNAL,
+                sourceToParse, IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, false);
+        shard.refresh("test");
+        try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
+            LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
+            TermsEnum terms = leaf.terms("field").iterator();
+            assertTrue(terms.seekExact(new BytesRef("b")));
+            PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(positionIncrementGap + 1, postings.nextPosition());
+        }
+    }
+
+    public void testSearchAnalyzerSerialization() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "standard")
+                        .field("search_analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        // special case: default index analyzer
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "default")
+                        .field("search_analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        // special case: default search analyzer
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "keyword")
+                        .field("search_analyzer", "default")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        XContentBuilder builder = XContentFactory.jsonBuilder();
+        builder.startObject();
+        mapper.toXContent(builder, new ToXContent.MapParams(Collections.singletonMap("include_defaults", "true")));
+        builder.endObject();
+
+        String mappingString = Strings.toString(builder);
+        assertTrue(mappingString.contains("analyzer"));
+        assertTrue(mappingString.contains("search_analyzer"));
+        assertTrue(mappingString.contains("search_quote_analyzer"));
+    }
+
+    public void testSearchQuoteAnalyzerSerialization() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "standard")
+                        .field("search_analyzer", "standard")
+                        .field("search_quote_analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        // special case: default index/search analyzer
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "default")
+                        .field("search_analyzer", "default")
+                        .field("search_quote_analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping, mapper.mappingSource().toString());
+    }
+
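+    // Check each term_vector option maps onto the expected Lucene FieldType flags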
+    public void testTermVectors() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field1")
+                        .field("type", getFieldType())
+                        .field("term_vector", "no")
+                    .endObject()
+                    .startObject("field2")
+                        .field("type", getFieldType())
+                        .field("term_vector", "yes")
+                    .endObject()
+                    .startObject("field3")
+                        .field("type", getFieldType())
+                        .field("term_vector", "with_offsets")
+                    .endObject()
+                    .startObject("field4")
+                        .field("type", getFieldType())
+                        .field("term_vector", "with_positions")
+                    .endObject()
+                    .startObject("field5")
+                        .field("type", getFieldType())
+                        .field("term_vector", "with_positions_offsets")
+                    .endObject()
+                    .startObject("field6")
+                        .field("type", getFieldType())
+                        .field("term_vector", "with_positions_offsets_payloads")
+                    .endObject()
+                .endObject()
+                .endObject().endObject());
+
+        DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));
+
+        ParsedDocument doc = defaultMapper.parse(SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field1", "1234")
+                        .field("field2", "1234")
+                        .field("field3", "1234")
+                        .field("field4", "1234")
+                        .field("field5", "1234")
+                        .field("field6", "1234")
+                        .endObject()),
+                XContentType.JSON));
+
+        assertThat(doc.rootDoc().getField("field1").fieldType().storeTermVectors(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field1").fieldType().storeTermVectorOffsets(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field1").fieldType().storeTermVectorPositions(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field1").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field2").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field2").fieldType().storeTermVectorOffsets(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field2").fieldType().storeTermVectorPositions(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field2").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field3").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field3").fieldType().storeTermVectorOffsets(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field3").fieldType().storeTermVectorPositions(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field3").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field4").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field4").fieldType().storeTermVectorOffsets(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field4").fieldType().storeTermVectorPositions(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field4").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field5").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field5").fieldType().storeTermVectorOffsets(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field5").fieldType().storeTermVectorPositions(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field5").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field6").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field6").fieldType().storeTermVectorOffsets(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field6").fieldType().storeTermVectorPositions(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field6").fieldType().storeTermVectorPayloads(), equalTo(true));
+    }
+
+    public void testNullConfigValuesFail() throws MapperParsingException, IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject()
+                .startObject("type")
+                    .startObject("properties")
+                        .startObject("field")
+                            .field("type", getFieldType())
+                            .field("analyzer", (String) null)
+                        .endObject()
+                    .endObject()
+                .endObject().endObject());
+
+        Exception e = expectThrows(MapperParsingException.class, () -> parser.parse("type", new CompressedXContent(mapping)));
+        assertEquals("[analyzer] must not have a [null] value", e.getMessage());
+    }
+
+    public void testNotIndexedField() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field")
+                .field("type", getFieldType())
+                .field("index", false)
+                .endObject().endObject().endObject().endObject());
+
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+                () -> parser.parse("type", new CompressedXContent(mapping)));
+        assertEquals("[annotated_text] fields must be indexed", e.getMessage());
+    }
+
+    public void testAnalyzedFieldPositionIncrementWithoutPositions() throws IOException {
+        for (String indexOptions : Arrays.asList("docs", "freqs")) {
+            String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                    .startObject("properties").startObject("field")
+                    .field("type", getFieldType())
+                    .field("index_options", indexOptions)
+                    .field("position_increment_gap", 10)
+                    .endObject().endObject().endObject().endObject());
+
+            IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+                    () -> parser.parse("type", new CompressedXContent(mapping)));
+            assertEquals("Cannot set position_increment_gap on field [field] without positions enabled", e.getMessage());
+        }
+    }
+
+    public void testEmptyName() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject()
+                .startObject("type")
+                    .startObject("properties")
+                        .startObject("")
+                            .field("type", getFieldType())
+                        .endObject()
+                    .endObject()
+                .endObject().endObject());
+
+        // Empty name not allowed in index created after 5.0
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+                () -> parser.parse("type", new CompressedXContent(mapping))
+        );
+        assertThat(e.getMessage(), containsString("name cannot be empty string"));
+    }
+
+}
diff --git a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextParsingTests.java b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextParsingTests.java
new file mode 100644
index 00000000000..4df44df5cd5
--- /dev/null
+++ b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextParsingTests.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.annotatedtext;
+
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken;
+import org.elasticsearch.test.ESTestCase;
+
+import java.util.List;
+
+import static org.hamcrest.Matchers.equalTo;
+
+public class AnnotatedTextParsingTests extends ESTestCase {
+
+    private void checkParsing(String markup, String expectedPlainText, AnnotationToken... expectedTokens) {
+        AnnotatedText at = AnnotatedText.parse(markup);
+        assertEquals(expectedPlainText, at.textMinusMarkup);
+        List<AnnotationToken> actualAnnotations = at.annotations;
+        assertEquals(expectedTokens.length, actualAnnotations.size());
+        for (int i = 0; i < expectedTokens.length; i++) {
+            assertEquals(expectedTokens[i], actualAnnotations.get(i));
+        }
+    }
+
+    public void testSingleValueMarkup() {
+        checkParsing("foo [bar](Y)", "foo bar", new AnnotationToken(4, 7, "Y"));
+    }
+
+    public void testMultiValueMarkup() {
+        checkParsing("foo [bar](Y&B)", "foo bar", new AnnotationToken(4, 7, "Y"),
+                new AnnotationToken(4, 7, "B"));
+    }
+
+    public void testBlankTextAnnotation() {
+        checkParsing("It sounded like this:[](theSoundOfOneHandClapping)", "It sounded like this:",
+                new AnnotationToken(21, 21, "theSoundOfOneHandClapping"));
+    }
+
+    public void testMissingBracket() {
+        checkParsing("[foo](MissingEndBracket bar",
+                "[foo](MissingEndBracket bar", new AnnotationToken[0]);
+    }
+
+    public void testAnnotationWithType() {
+        Exception expectedException = expectThrows(ElasticsearchParseException.class,
+                () -> checkParsing("foo [bar](type=foo) baz", "foo bar baz", new AnnotationToken(4, 7, "noType")));
+        assertThat(expectedException.getMessage(), equalTo("key=value pairs are not supported in annotations"));
+    }
+
+    public void testMissingValue() {
+        checkParsing("[foo]() bar", "foo bar", new AnnotationToken[0]);
+    }
+
+}
diff --git a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/highlight/AnnotatedTextHighlighterTests.java b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/highlight/AnnotatedTextHighlighterTests.java
new file mode 100644
index 00000000000..2fcf917ab1d
--- /dev/null
+++ b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/highlight/AnnotatedTextHighlighterTests.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.search.highlight;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.highlight.DefaultEncoder;
+import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
+import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
+import org.apache.lucene.search.uhighlight.PassageFormatter;
+import org.apache.lucene.search.uhighlight.Snippet;
+import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
+import org.apache.lucene.store.Directory;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper;
+import org.elasticsearch.search.fetch.subphase.highlight.AnnotatedPassageFormatter;
+import org.elasticsearch.test.ESTestCase;
+
+import java.net.URLEncoder;
+import java.text.BreakIterator;
+import java.util.Locale;
+
+import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
+import static org.hamcrest.CoreMatchers.equalTo;
+
+public class AnnotatedTextHighlighterTests extends ESTestCase {
+
+    private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
+                                       Query query, Locale locale, BreakIterator breakIterator,
+                                       int noMatchSize, String[] expectedPassages) throws Exception {
+
+        // Annotated fields wrap the usual analyzer with one that injects extra tokens
+        Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
+        AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
+        hiliteAnalyzer.init(markedUpInputs);
+        PassageFormatter passageFormatter = new AnnotatedPassageFormatter(hiliteAnalyzer, new DefaultEncoder());
+        String[] plainTextForHighlighter = hiliteAnalyzer.getPlainTextValuesForHighlighter();
+
+        Directory dir = newDirectory();
+        IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
+        iwc.setMergePolicy(newTieredMergePolicy(random()));
+        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+        FieldType ft = new FieldType(TextField.TYPE_STORED);
+        if (randomBoolean()) {
+            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+        } else {
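+            // Without stored offsets the highlighter has to re-analyze the field value to find hit offsets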
+            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+        }
+        ft.freeze();
+        Document doc = new Document();
+        for (String input : markedUpInputs) {
+            Field field = new Field(fieldName, "", ft);
+            field.setStringValue(input);
+            doc.add(field);
+        }
+        iw.addDocument(doc);
+        DirectoryReader reader = iw.getReader();
+        IndexSearcher searcher = newSearcher(reader);
+        iw.close();
+        TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
+        assertThat(topDocs.totalHits.value, equalTo(1L));
+        String rawValue = Strings.arrayToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
+
+        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, hiliteAnalyzer, null,
+                passageFormatter, locale,
+                breakIterator, rawValue, noMatchSize);
+        highlighter.setFieldMatcher((name) -> "text".equals(name));
+        final Snippet[] snippets =
+                highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
+        assertEquals(expectedPassages.length, snippets.length);
+        for (int i = 0; i < snippets.length; i++) {
+            assertEquals(expectedPassages[i], snippets[i].getText());
+        }
+        reader.close();
+        dir.close();
+    }
+
+    public void testAnnotatedTextStructuredMatch() throws Exception {
+        // Check that a structured token eg a URL can be highlighted in a query
+        // on marked-up content using an "annotated_text" type field.
+        String url = "https://en.wikipedia.org/wiki/Key_Word_in_Context";
+        String encodedUrl = URLEncoder.encode(url, "UTF-8");
+        String annotatedWord = "[highlighting](" + encodedUrl + ")";
+        String highlightedAnnotatedWord = "[highlighting](" + AnnotatedPassageFormatter.SEARCH_HIT_TYPE + "=" + encodedUrl + "&"
+                + encodedUrl + ")";
+        final String[] markedUpInputs = { "This is a test. Just a test1 " + annotatedWord + " from [annotated](bar) highlighter.",
+                "This is the second " + annotatedWord + " value to perform highlighting on a longer text that gets scored lower." };
+
+        String[] expectedPassages = {
+                "This is a test. Just a test1 " + highlightedAnnotatedWord + " from [annotated](bar) highlighter.",
+                "This is the second " + highlightedAnnotatedWord + " value to perform highlighting on a"
+                        + " longer text that gets scored lower." };
+        Query query = new TermQuery(new Term("text", url));
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
+    }
+
+    public void testAnnotatedTextOverlapsWithUnstructuredSearchTerms() throws Exception {
+        final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore",
+                "Donald duck is a [Disney](Disney+Inc) invention" };
+
+        String[] expectedPassages = { "[Donald](_hit_term=donald) Trump visited Singapore",
+                "[Donald](_hit_term=donald) duck is a [Disney](Disney+Inc) invention" };
+        Query query = new TermQuery(new Term("text", "donald"));
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
+    }
+
+    public void testAnnotatedTextMultiFieldWithBreakIterator() throws Exception {
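+        // A sentence-splitting break iterator should yield one highlighted passage per sentence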
Kim shook hands with Donald", + "Donald duck is a [Disney](Disney+Inc) invention" }; + String[] expectedPassages = { "[Donald](_hit_term=donald) Trump visited Singapore", + "Kim shook hands with [Donald](_hit_term=donald)", + "[Donald](_hit_term=donald) duck is a [Disney](Disney+Inc) invention" }; + Query query = new TermQuery(new Term("text", "donald")); + BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); + breakIterator = new SplittingBreakIterator(breakIterator, '.'); + assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages); + } + + public void testAnnotatedTextSingleFieldWithBreakIterator() throws Exception { + final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore. Kim shook hands with Donald"}; + String[] expectedPassages = { "[Donald](_hit_term=donald) Trump visited Singapore", + "Kim shook hands with [Donald](_hit_term=donald)"}; + Query query = new TermQuery(new Term("text", "donald")); + BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); + breakIterator = new SplittingBreakIterator(breakIterator, '.'); + assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages); + } + + public void testAnnotatedTextSingleFieldWithPhraseQuery() throws Exception { + final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore", + "Donald Jr was with Melania Trump"}; + String[] expectedPassages = { "[Donald](_hit_term=donald) [Trump](_hit_term=trump) visited Singapore"}; + Query query = new PhraseQuery("text", "donald", "trump"); + BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); + assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages); + } + + public void testBadAnnotation() throws Exception { + final String[] markedUpInputs = { "Missing bracket for [Donald Trump](Donald+Trump visited Singapore"}; + String[] expectedPassages = { "Missing bracket for [Donald Trump](Donald+Trump visited [Singapore](_hit_term=singapore)"}; + Query query = new TermQuery(new Term("text", "singapore")); + BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); + assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages); + } + +} diff --git a/plugins/mapper-annotated-text/src/test/resources/rest-api-spec/test/mapper_annotatedtext/10_basic.yml b/plugins/mapper-annotated-text/src/test/resources/rest-api-spec/test/mapper_annotatedtext/10_basic.yml new file mode 100644 index 00000000000..64e0b863bf9 --- /dev/null +++ b/plugins/mapper-annotated-text/src/test/resources/rest-api-spec/test/mapper_annotatedtext/10_basic.yml @@ -0,0 +1,44 @@ +# Integration tests for Mapper Annotated_text components +# + +--- +"annotated highlighter on annotated text": + - skip: + version: " - 6.99.99" + reason: Annotated text type introduced in 7.0.0-alpha1 + + - do: + indices.create: + index: annotated + body: + settings: + number_of_shards: "1" + number_of_replicas: "0" + mappings: + doc: + properties: + text: + type: annotated_text + entityID: + type: keyword + + - do: + index: + index: annotated + type: doc + body: + "text" : "The [quick brown fox](entity_3789) is brown." 
+ "entityID": "entity_3789" + refresh: true + + - do: + search: + body: { "query" : {"term" : { "entityID" : "entity_3789" } }, "highlight" : { "type" : "annotated", "require_field_match": false, "fields" : { "text" : {} } } } + + - match: {hits.hits.0.highlight.text.0: "The [quick brown fox](_hit_term=entity_3789&entity_3789) is brown."} + + - do: + search: + body: { "query" : {"term" : { "text" : "quick" } }, "highlight" : { "type" : "annotated", "require_field_match": false, "fields" : { "text" : {} } } } + + - match: {hits.hits.0.highlight.text.0: "The [quick](_hit_term=quick) brown fox is brown."} diff --git a/qa/vagrant/src/test/resources/packaging/tests/module_and_plugin_test_cases.bash b/qa/vagrant/src/test/resources/packaging/tests/module_and_plugin_test_cases.bash index 8fd6bd9ad3f..7aeb03851a5 100644 --- a/qa/vagrant/src/test/resources/packaging/tests/module_and_plugin_test_cases.bash +++ b/qa/vagrant/src/test/resources/packaging/tests/module_and_plugin_test_cases.bash @@ -266,6 +266,10 @@ fi install_and_check_plugin mapper murmur3 } +@test "[$GROUP] install annotated-text mapper plugin" { + install_and_check_plugin mapper annotated-text +} + @test "[$GROUP] check reindex module" { check_module reindex } @@ -380,6 +384,10 @@ fi remove_plugin mapper-murmur3 } +@test "[$GROUP] remove annotated-text mapper plugin" { + remove_plugin mapper-annotated-text +} + @test "[$GROUP] remove size mapper plugin" { remove_plugin mapper-size } diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java index c1c42fb45a4..6ae302ee87a 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java @@ -18,10 +18,13 @@ */ package org.elasticsearch.search.fetch.subphase.highlight; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.SimpleHTMLEncoder; import org.elasticsearch.index.fieldvisitor.CustomFieldsVisitor; +import org.elasticsearch.index.mapper.DocumentMapper; +import org.elasticsearch.index.mapper.KeywordFieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.search.fetch.FetchSubPhase; import org.elasticsearch.search.internal.SearchContext; @@ -70,8 +73,18 @@ public final class HighlightUtils { return textsToHighlight; } - static class Encoders { - static final Encoder DEFAULT = new DefaultEncoder(); - static final Encoder HTML = new SimpleHTMLEncoder(); + public static class Encoders { + public static final Encoder DEFAULT = new DefaultEncoder(); + public static final Encoder HTML = new SimpleHTMLEncoder(); } + + static Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) { + if (type instanceof KeywordFieldMapper.KeywordFieldType) { + KeywordFieldMapper.KeywordFieldType keywordFieldType = (KeywordFieldMapper.KeywordFieldType) type; + if (keywordFieldType.normalizer() != null) { + return keywordFieldType.normalizer(); + } + } + return docMapper.mappers().indexAnalyzer(); + } } diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java index 1ac3f4789cb..ec5071706b0 100644 --- 
--- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java
+++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java
@@ -49,7 +49,6 @@ import java.util.List;
 import java.util.Map;
 
 import static org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter.convertFieldValue;
-import static org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter.getAnalyzer;
 
 public class PlainHighlighter implements Highlighter {
     private static final String CACHE_KEY = "highlight-plain";
@@ -102,7 +101,7 @@ public class PlainHighlighter implements Highlighter {
         int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1 : field.fieldOptions().numberOfFragments();
         ArrayList<TextFragment> fragsList = new ArrayList<>();
         List<Object> textsToHighlight;
-        Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
+        Analyzer analyzer = HighlightUtils.getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
         final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
 
         try {
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
index 2c9d482cab0..123e18a4da6 100644
--- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
+++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
@@ -26,6 +26,7 @@ import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
 import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
 import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
 import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
+import org.apache.lucene.search.uhighlight.PassageFormatter;
 import org.apache.lucene.search.uhighlight.Snippet;
 import org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource;
 import org.apache.lucene.util.BytesRef;
@@ -34,7 +35,6 @@ import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.text.Text;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.mapper.DocumentMapper;
-import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;
@@ -54,7 +54,7 @@ public class UnifiedHighlighter implements Highlighter {
     public boolean canHighlight(MappedFieldType fieldType) {
         return true;
     }
-    
+
     @Override
     public HighlightField highlight(HighlighterContext highlighterContext) {
         MappedFieldType fieldType = highlighterContext.fieldType;
@@ -62,23 +62,18 @@ public class UnifiedHighlighter implements Highlighter {
         SearchContext context = highlighterContext.context;
         FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
         Encoder encoder = field.fieldOptions().encoder().equals("html") ?
                 HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
-        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
-            field.fieldOptions().postTags()[0], encoder);
         final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
 
         List<Snippet> snippets = new ArrayList<>();
         int numberOfFragments;
         try {
-            final Analyzer analyzer =
-                getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
-            List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldType, context, hitContext);
-            fieldValues = fieldValues.stream()
-                .map((s) -> convertFieldValue(fieldType, s))
-                .collect(Collectors.toList());
+            final Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
+            List<Object> fieldValues = loadFieldValues(fieldType, field, context, hitContext);
             if (fieldValues.size() == 0) {
                 return null;
             }
+            final PassageFormatter passageFormatter = getPassageFormatter(field, encoder);
             final IndexSearcher searcher = new IndexSearcher(hitContext.reader());
             final CustomUnifiedHighlighter highlighter;
             final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
@@ -145,7 +140,27 @@ public class UnifiedHighlighter implements Highlighter {
         return null;
     }
 
-    private BreakIterator getBreakIterator(SearchContextHighlight.Field field) {
+    protected PassageFormatter getPassageFormatter(SearchContextHighlight.Field field, Encoder encoder) {
+        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
+            field.fieldOptions().postTags()[0], encoder);
+        return passageFormatter;
+    }
+
+
+    protected Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
+        return HighlightUtils.getAnalyzer(docMapper, type);
+    }
+
+    protected List<Object> loadFieldValues(MappedFieldType fieldType, SearchContextHighlight.Field field, SearchContext context,
+                                           FetchSubPhase.HitContext hitContext) throws IOException {
+        List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldType, context, hitContext);
+        fieldValues = fieldValues.stream()
+            .map((s) -> convertFieldValue(fieldType, s))
+            .collect(Collectors.toList());
+        return fieldValues;
+    }
+
+    protected BreakIterator getBreakIterator(SearchContextHighlight.Field field) {
         final SearchContextHighlight.FieldOptions fieldOptions = field.fieldOptions();
         final Locale locale =
             fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() :
@@ -168,7 +183,7 @@
         }
     }
 
-    private static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
+    protected static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
 
         //We need to filter the snippets as due to no_match_size we could have
         //either highlighted snippets or non highlighted ones and we don't want to mix those up
@@ -203,17 +218,7 @@
         return filteredSnippets;
     }
 
-    static Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
-        if (type instanceof KeywordFieldMapper.KeywordFieldType) {
-            KeywordFieldMapper.KeywordFieldType keywordFieldType = (KeywordFieldMapper.KeywordFieldType) type;
-            if (keywordFieldType.normalizer() != null) {
-                return keywordFieldType.normalizer();
-            }
-        }
-        return docMapper.mappers().indexAnalyzer();
-    }
-
-    static String convertFieldValue(MappedFieldType type, Object value) {
+    protected static String convertFieldValue(MappedFieldType type, Object value) {
         if (value instanceof BytesRef) {
             return type.valueForDisplay(value).toString();
         } else {
@@ -221,14 +226,14 @@
         }
     }
 
-    private static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
+    protected static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
         //postings highlighter accepts all values in a single string, as offsets etc. need to match with content
         //loaded from stored fields, we merge all values using a proper separator
         String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(valuesSeparator));
         return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1));
     }
 
-    private OffsetSource getOffsetSource(MappedFieldType fieldType) {
+    protected OffsetSource getOffsetSource(MappedFieldType fieldType) {
         if (fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
             return fieldType.storeTermVectors() ? OffsetSource.POSTINGS_WITH_TERM_VECTORS : OffsetSource.POSTINGS;
         }