Merge branch 'master' into hlclient/add-delete-method

David Pilato 2017-02-24 09:23:03 +01:00
commit 3e4b917066
22 changed files with 797 additions and 1251 deletions


@@ -140,6 +140,7 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
@@ -225,6 +226,7 @@ public final class AnalysisModule {
tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
tokenFilters.register("elision", ElisionTokenFilterFactory::new);
tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);


@@ -51,6 +51,7 @@ import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@@ -87,6 +88,18 @@ public enum PreBuiltTokenFilters {
}
},
WORD_DELIMITER_GRAPH(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new WordDelimiterGraphFilter(tokenStream,
WordDelimiterGraphFilter.GENERATE_WORD_PARTS |
WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS |
WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE |
WordDelimiterGraphFilter.SPLIT_ON_NUMERICS |
WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);
}
},
STOP(CachingStrategy.LUCENE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
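For context, here is a minimal standalone sketch of what the pre-built word_delimiter_graph filter above produces. The flag combination mirrors the enum constant; the class name and input string are made up for illustration, and everything else is plain Lucene API.

// A hypothetical standalone demo of the same flag combination as the pre-built filter above.
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WordDelimiterGraphDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot 500-42 O'Neil's"));
        TokenStream stream = new WordDelimiterGraphFilter(tokenizer,
                WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                        | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                        | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                        | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                        | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE,
                null); // no protected words
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term); // Power, Shot, 500, 42, O, Neil
        }
        stream.end();
        stream.close();
    }
}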


@@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.elasticsearch.Version;
import org.elasticsearch.action.support.ToXContentToBytes;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException;
@@ -32,10 +33,12 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order;
import java.io.IOException;
import java.util.Arrays;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.BiFunction;
@@ -57,8 +60,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public static final ParseField NUMBER_OF_FRAGMENTS_FIELD = new ParseField("number_of_fragments");
public static final ParseField ENCODER_FIELD = new ParseField("encoder");
public static final ParseField REQUIRE_FIELD_MATCH_FIELD = new ParseField("require_field_match");
public static final ParseField BOUNDARY_SCANNER_FIELD = new ParseField("boundary_scanner");
public static final ParseField BOUNDARY_MAX_SCAN_FIELD = new ParseField("boundary_max_scan");
public static final ParseField BOUNDARY_CHARS_FIELD = new ParseField("boundary_chars");
public static final ParseField BOUNDARY_SCANNER_LOCALE_FIELD = new ParseField("boundary_scanner_locale");
public static final ParseField TYPE_FIELD = new ParseField("type");
public static final ParseField FRAGMENTER_FIELD = new ParseField("fragmenter");
public static final ParseField NO_MATCH_SIZE_FIELD = new ParseField("no_match_size");
@@ -88,10 +93,14 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
protected Boolean forceSource;
protected BoundaryScannerType boundaryScannerType;
protected Integer boundaryMaxScan;
protected char[] boundaryChars;
protected Locale boundaryScannerLocale;
protected Integer noMatchSize;
protected Integer phraseLimit;
@@ -119,10 +128,18 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
order(in.readOptionalWriteable(Order::readFromStream));
highlightFilter(in.readOptionalBoolean());
forceSource(in.readOptionalBoolean());
if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
boundaryScannerType(in.readOptionalWriteable(BoundaryScannerType::readFromStream));
}
boundaryMaxScan(in.readOptionalVInt());
if (in.readBoolean()) {
boundaryChars(in.readString().toCharArray());
}
if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
if (in.readBoolean()) {
boundaryScannerLocale(in.readString());
}
}
noMatchSize(in.readOptionalVInt());
phraseLimit(in.readOptionalVInt());
if (in.readBoolean()) {
@@ -150,12 +167,22 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
out.writeOptionalWriteable(order);
out.writeOptionalBoolean(highlightFilter);
out.writeOptionalBoolean(forceSource);
if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
out.writeOptionalWriteable(boundaryScannerType);
}
out.writeOptionalVInt(boundaryMaxScan);
boolean hasBounaryChars = boundaryChars != null;
out.writeBoolean(hasBounaryChars);
if (hasBounaryChars) {
out.writeString(String.valueOf(boundaryChars));
}
if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
boolean hasBoundaryScannerLocale = boundaryScannerLocale != null;
out.writeBoolean(hasBoundaryScannerLocale);
if (hasBoundaryScannerLocale) {
out.writeString(boundaryScannerLocale.toLanguageTag());
}
}
out.writeOptionalVInt(noMatchSize);
out.writeOptionalVInt(phraseLimit);
boolean hasOptions = options != null;
@@ -331,6 +358,33 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
return this.highlightFilter;
}
/**
* When using the highlighterType <tt>fvh</tt> this setting
* controls which scanner to use for fragment boundaries, and defaults to "simple".
*/
@SuppressWarnings("unchecked")
public HB boundaryScannerType(String boundaryScannerType) {
this.boundaryScannerType = BoundaryScannerType.fromString(boundaryScannerType);
return (HB) this;
}
/**
* When using the highlighterType <tt>fvh</tt> this setting
* controls which scanner to use for fragment boundaries, and defaults to "simple".
*/
@SuppressWarnings("unchecked")
public HB boundaryScannerType(BoundaryScannerType boundaryScannerType) {
this.boundaryScannerType = boundaryScannerType;
return (HB) this;
}
/**
* @return the value set by {@link #boundaryScannerType(String)}
*/
public BoundaryScannerType boundaryScannerType() {
return this.boundaryScannerType;
}
/**
* When using the highlighterType <tt>fvh</tt> this setting
* controls how far to look for boundary characters, and defaults to 20.
@@ -366,6 +420,25 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
return this.boundaryChars;
}
/**
* When using the highlighterType <tt>fvh</tt> and boundaryScannerType <tt>break_iterator</tt>, this setting
* controls the locale to use by the BreakIterator, defaults to "root".
*/
@SuppressWarnings("unchecked")
public HB boundaryScannerLocale(String boundaryScannerLocale) {
if (boundaryScannerLocale != null) {
this.boundaryScannerLocale = Locale.forLanguageTag(boundaryScannerLocale);
}
return (HB) this;
}
/**
* @return the value set by {@link #boundaryScannerLocale(String)}
*/
public Locale boundaryScannerLocale() {
return this.boundaryScannerLocale;
}
/**
* Allows to set custom options for custom highlighters.
*/
@@ -491,12 +564,18 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
if (highlightFilter != null) {
builder.field(HIGHLIGHT_FILTER_FIELD.getPreferredName(), highlightFilter);
}
if (boundaryScannerType != null) {
builder.field(BOUNDARY_SCANNER_FIELD.getPreferredName(), boundaryScannerType.name());
}
if (boundaryMaxScan != null) {
builder.field(BOUNDARY_MAX_SCAN_FIELD.getPreferredName(), boundaryMaxScan);
}
if (boundaryChars != null) {
builder.field(BOUNDARY_CHARS_FIELD.getPreferredName(), new String(boundaryChars));
}
if (boundaryScannerLocale != null) {
builder.field(BOUNDARY_SCANNER_LOCALE_FIELD.getPreferredName(), boundaryScannerLocale.toLanguageTag());
}
if (options != null && options.size() > 0) {
builder.field(OPTIONS_FIELD.getPreferredName(), options);
}
@@ -523,8 +602,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
parser.declareInt(HB::fragmentSize, FRAGMENT_SIZE_FIELD);
parser.declareInt(HB::numOfFragments, NUMBER_OF_FRAGMENTS_FIELD);
parser.declareBoolean(HB::requireFieldMatch, REQUIRE_FIELD_MATCH_FIELD);
parser.declareString(HB::boundaryScannerType, BOUNDARY_SCANNER_FIELD);
parser.declareInt(HB::boundaryMaxScan, BOUNDARY_MAX_SCAN_FIELD);
parser.declareString((HB hb, String bc) -> hb.boundaryChars(bc.toCharArray()) , BOUNDARY_CHARS_FIELD);
parser.declareString(HB::boundaryScannerLocale, BOUNDARY_SCANNER_LOCALE_FIELD);
parser.declareString(HB::highlighterType, TYPE_FIELD);
parser.declareString(HB::fragmenter, FRAGMENTER_FIELD);
parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
@@ -562,8 +643,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public final int hashCode() {
return Objects.hash(getClass(), Arrays.hashCode(preTags), Arrays.hashCode(postTags), fragmentSize,
numOfFragments, highlighterType, fragmenter, highlightQuery, order, highlightFilter,
forceSource, boundaryMaxScan, Arrays.hashCode(boundaryChars), noMatchSize,
phraseLimit, options, requireFieldMatch, doHashCode());
forceSource, boundaryScannerType, boundaryMaxScan, Arrays.hashCode(boundaryChars), boundaryScannerLocale,
noMatchSize, phraseLimit, options, requireFieldMatch, doHashCode());
}
/**
@@ -591,8 +672,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
Objects.equals(order, other.order) &&
Objects.equals(highlightFilter, other.highlightFilter) &&
Objects.equals(forceSource, other.forceSource) &&
Objects.equals(boundaryScannerType, other.boundaryScannerType) &&
Objects.equals(boundaryMaxScan, other.boundaryMaxScan) &&
Arrays.equals(boundaryChars, other.boundaryChars) &&
Objects.equals(boundaryScannerLocale, other.boundaryScannerLocale) &&
Objects.equals(noMatchSize, other.noMatchSize) &&
Objects.equals(phraseLimit, other.phraseLimit) &&
Objects.equals(options, other.options) &&
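To tie the new builder options together, here is a minimal usage sketch (not part of this commit) of how a search request might enable the FVH sentence boundary scanner through the methods added above; the class name and the field name "body" are illustrative assumptions.

// Hypothetical usage of the boundary scanner options added above (index/field names are examples).
import java.util.Locale;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;

public class BoundaryScannerExample {
    public static SearchSourceBuilder source() {
        HighlightBuilder highlight = new HighlightBuilder()
                .field("body")                                         // highlight the "body" field
                .highlighterType("fvh")                                // boundary scanners only apply to the fvh
                .boundaryScannerType(BoundaryScannerType.SENTENCE)     // break fragments at sentence boundaries
                .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag());
        return new SearchSourceBuilder().highlighter(highlight);
    }
}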


@@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.CustomFieldQuery;
import org.apache.lucene.search.vectorhighlight.FieldFragList;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
@@ -38,15 +39,23 @@ import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.Field;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions;
import org.elasticsearch.search.internal.SearchContext;
import java.text.BreakIterator;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
public class FastVectorHighlighter implements Highlighter {
private static final SimpleBoundaryScanner DEFAULT_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
BreakIterator.getSentenceInstance(Locale.ROOT));
private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
BreakIterator.getWordInstance(Locale.ROOT));
public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
true, Setting.Property.NodeScope);
@@ -105,12 +114,7 @@ public class FastVectorHighlighter implements Highlighter {
FragListBuilder fragListBuilder;
BaseFragmentsBuilder fragmentsBuilder;
BoundaryScanner boundaryScanner = DEFAULT_BOUNDARY_SCANNER;
final BoundaryScanner boundaryScanner = getBoundaryScanner(field);
if (field.fieldOptions().boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
|| field.fieldOptions().boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
boundaryScanner = new SimpleBoundaryScanner(field.fieldOptions().boundaryMaxScan(),
field.fieldOptions().boundaryChars());
}
boolean forceSource = context.highlight().forceSource(field);
if (field.fieldOptions().numberOfFragments() == 0) {
fragListBuilder = new SingleFragListBuilder();
@@ -206,6 +210,29 @@ public class FastVectorHighlighter implements Highlighter {
&& fieldMapper.fieldType().storeTermVectorPositions();
}
private static BoundaryScanner getBoundaryScanner(Field field) {
final FieldOptions fieldOptions = field.fieldOptions();
final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale();
switch(fieldOptions.boundaryScannerType()) {
case SENTENCE:
if (boundaryScannerLocale != null) {
return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale));
}
return DEFAULT_SENTENCE_BOUNDARY_SCANNER;
case WORD:
if (boundaryScannerLocale != null) {
return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale));
}
return DEFAULT_WORD_BOUNDARY_SCANNER;
default:
if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
|| fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
}
return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
}
}
private class MapperHighlightEntry {
public FragListBuilder fragListBuilder;
public FragmentsBuilder fragmentsBuilder;


@@ -95,9 +95,9 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
.preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED)
.highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH)
.forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE)
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER)
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER).boundaryScannerType(BoundaryScannerType.CHARS)
.boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS)
.noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();
.boundaryScannerLocale(Locale.ROOT).noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();
private final List<Field> fields = new ArrayList<>();
@@ -327,12 +327,18 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
if (highlighterBuilder.requireFieldMatch != null) {
targetOptionsBuilder.requireFieldMatch(highlighterBuilder.requireFieldMatch);
}
if (highlighterBuilder.boundaryScannerType != null) {
targetOptionsBuilder.boundaryScannerType(highlighterBuilder.boundaryScannerType);
}
if (highlighterBuilder.boundaryMaxScan != null) {
targetOptionsBuilder.boundaryMaxScan(highlighterBuilder.boundaryMaxScan);
}
if (highlighterBuilder.boundaryChars != null) {
targetOptionsBuilder.boundaryChars(convertCharArray(highlighterBuilder.boundaryChars));
}
if (highlighterBuilder.boundaryScannerLocale != null) {
targetOptionsBuilder.boundaryScannerLocale(highlighterBuilder.boundaryScannerLocale);
}
if (highlighterBuilder.highlighterType != null) {
targetOptionsBuilder.highlighterType(highlighterBuilder.highlighterType);
}
@@ -522,4 +528,30 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
return name().toLowerCase(Locale.ROOT);
}
}
public enum BoundaryScannerType implements Writeable {
CHARS, WORD, SENTENCE;
public static BoundaryScannerType readFromStream(StreamInput in) throws IOException {
int ordinal = in.readVInt();
if (ordinal < 0 || ordinal >= values().length) {
throw new IOException("Unknown BoundaryScannerType ordinal [" + ordinal + "]");
}
return values()[ordinal];
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeVInt(this.ordinal());
}
public static BoundaryScannerType fromString(String boundaryScannerType) {
return valueOf(boundaryScannerType.toUpperCase(Locale.ROOT));
}
@Override
public String toString() {
return name().toLowerCase(Locale.ROOT);
}
}
}


@@ -20,11 +20,13 @@
package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.search.Query;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@@ -110,10 +112,14 @@ public class SearchContextHighlight {
private String fragmenter;
private BoundaryScannerType boundaryScannerType;
private int boundaryMaxScan = -1;
private Character[] boundaryChars = null;
private Locale boundaryScannerLocale;
private Query highlightQuery;
private int noMatchSize = -1;
@@ -168,6 +174,10 @@ public class SearchContextHighlight {
return fragmenter;
}
public BoundaryScannerType boundaryScannerType() {
return boundaryScannerType;
}
public int boundaryMaxScan() {
return boundaryMaxScan;
}
@@ -176,6 +186,10 @@ public class SearchContextHighlight {
return boundaryChars;
}
public Locale boundaryScannerLocale() {
return boundaryScannerLocale;
}
public Query highlightQuery() {
return highlightQuery;
}
@@ -260,6 +274,11 @@ public class SearchContextHighlight {
return this;
}
Builder boundaryScannerType(BoundaryScannerType boundaryScanner) {
fieldOptions.boundaryScannerType = boundaryScanner;
return this;
}
Builder boundaryMaxScan(int boundaryMaxScan) {
fieldOptions.boundaryMaxScan = boundaryMaxScan;
return this;
@@ -270,6 +289,11 @@ public class SearchContextHighlight {
return this;
}
Builder boundaryScannerLocale(Locale boundaryScannerLocale) {
fieldOptions.boundaryScannerLocale = boundaryScannerLocale;
return this;
}
Builder highlightQuery(Query highlightQuery) {
fieldOptions.highlightQuery = highlightQuery;
return this;
@@ -324,12 +348,18 @@ public class SearchContextHighlight {
if (fieldOptions.requireFieldMatch == null) {
fieldOptions.requireFieldMatch = globalOptions.requireFieldMatch;
}
if (fieldOptions.boundaryScannerType == null) {
fieldOptions.boundaryScannerType = globalOptions.boundaryScannerType;
}
if (fieldOptions.boundaryMaxScan == -1) {
fieldOptions.boundaryMaxScan = globalOptions.boundaryMaxScan;
}
if (fieldOptions.boundaryChars == null && globalOptions.boundaryChars != null) {
fieldOptions.boundaryChars = Arrays.copyOf(globalOptions.boundaryChars, globalOptions.boundaryChars.length);
}
if (fieldOptions.boundaryScannerLocale == null) {
fieldOptions.boundaryScannerLocale = globalOptions.boundaryScannerLocale;
}
if (fieldOptions.highlighterType == null) {
fieldOptions.highlighterType = globalOptions.highlighterType;
}


@@ -0,0 +1,146 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import java.io.IOException;
import java.io.StringReader;
/**
* Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory}
*/
public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase {
final String type;
public BaseWordDelimiterTokenFilterFactoryTestCase(String type) {
this.type = type;
}
public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi",
"fi", "4000", "j", "2", "se", "O", "Neil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateWords() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateNumbers() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
"se", "O", "Neil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateAll() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testSplitOnCaseChange() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
String[] expected = new String[]{"PowerShot"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testPreserveOriginal() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi",
"wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testStemEnglishPossessive() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
"se", "O", "Neil", "s"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
}


@@ -0,0 +1,75 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import java.io.StringReader;
public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterGraphTokenFilterFactoryTests() {
super("word_delimiter_graph");
}
public void testMultiTerms() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
"wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
"ONeil", "O'Neil's", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
expectedIncr, expectedPosLen, null);
}
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
int[] expectedIncr = new int[]{1, 0, 1};
int[] expectedPosLen = new int[]{2, 1, 1};
String[] expected = new String[]{"PowerShot", "Power", "Shot" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
expectedIncr, expectedPosLen, null);
}
}


@@ -24,118 +24,20 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import java.io.IOException;
import java.io.StringReader;
public class WordDelimiterTokenFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterTokenFilterFactoryTests() {
super("word_delimiter");
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateWords() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateNumbers() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateAll() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testSplitOnCaseChange() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
String[] expected = new String[]{"PowerShot"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testPreserveOriginal() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testStemEnglishPossessive() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
.build());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"};
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build());


@@ -47,6 +47,7 @@ import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.SearchModule;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions;
@@ -288,6 +289,7 @@ public class HighlightBuilderTests extends ESTestCase {
mergeBeforeChek(highlightBuilder, fieldBuilder, fieldOptions);
checkSame.accept(AbstractHighlighterBuilder::boundaryChars, FieldOptions::boundaryChars);
checkSame.accept(AbstractHighlighterBuilder::boundaryScannerType, FieldOptions::boundaryScannerType);
checkSame.accept(AbstractHighlighterBuilder::boundaryMaxScan, FieldOptions::boundaryMaxScan);
checkSame.accept(AbstractHighlighterBuilder::fragmentSize, FieldOptions::fragmentCharSize);
checkSame.accept(AbstractHighlighterBuilder::fragmenter, FieldOptions::fragmenter);
@@ -557,12 +559,23 @@ public class HighlightBuilderTests extends ESTestCase {
if (randomBoolean()) {
highlightBuilder.forceSource(randomBoolean());
}
if (randomBoolean()) {
if (randomBoolean()) {
highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values()));
} else {
// also test the string setter
highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values()).toString());
}
}
if (randomBoolean()) {
highlightBuilder.boundaryMaxScan(randomIntBetween(0, 10));
}
if (randomBoolean()) {
highlightBuilder.boundaryChars(randomAsciiOfLengthBetween(1, 10).toCharArray());
}
if (randomBoolean()) {
highlightBuilder.boundaryScannerLocale(randomLocale(random()).toLanguageTag());
}
if (randomBoolean()) {
highlightBuilder.noMatchSize(randomIntBetween(0, 10));
}


@@ -44,6 +44,7 @@ import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field;
import org.elasticsearch.search.sort.SortOrder;
import org.elasticsearch.test.ESIntegTestCase;
@@ -57,6 +58,7 @@ import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import static org.elasticsearch.client.Requests.searchRequest;
@@ -747,7 +749,94 @@ public class HighlighterSearchIT extends ESIntegTestCase {
searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <em>quick</em> brown fox jumps over"));
}
public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "A sentence with few words. Another sentence with even more words."));
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "sentence"))
.highlighter(highlight()
.field("field1", 20, 2)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.SENTENCE));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
}
public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "A sentence with few words. Another sentence with even more words."));
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "sentence"))
.highlighter(highlight()
.field("field1", 20, 2)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.SENTENCE)
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
}
public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "some"))
.highlighter(highlight()
.field("field1", 23, 1)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
}
public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "some"))
.highlighter(highlight()
.field("field1", 23, 1)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD)
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
}
/**
View File
@ -81,6 +81,7 @@ buildRestTests.expectedUnconvertedCandidates = [
'reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc',
'reference/cat/snapshots.asciidoc', 'reference/cat/snapshots.asciidoc',
'reference/cat/templates.asciidoc', 'reference/cat/templates.asciidoc',
'reference/cat/thread_pool.asciidoc', 'reference/cat/thread_pool.asciidoc',
View File
@ -0,0 +1,97 @@
[[analysis-word-delimiter-graph-tokenfilter]]
=== Word Delimiter Graph Token Filter
experimental[]
Named `word_delimiter_graph`, this token filter splits words into subwords and
performs optional transformations on subword groups. Words are split into
subwords according to the following rules:
* split on intra-word delimiters (by default, all non-alphanumeric
characters).
* "Wi-Fi" -> "Wi", "Fi"
* split on case transitions: "PowerShot" -> "Power", "Shot"
* split on letter-number transitions: "SD500" -> "SD", "500"
* leading and trailing intra-word delimiters on each subword are
ignored: "//hello---there, 'dude'" -> "hello", "there", "dude"
* trailing "'s" are removed for each subword: "O'Neil's" -> "O", "Neil"
Unlike the `word_delimiter` filter, this token filter correctly handles positions for
multi-term expansion at search time when any of the following options
is set to `true`:
* `preserve_original`
* `catenate_numbers`
* `catenate_words`
* `catenate_all`
Parameters include:
`generate_word_parts`::
If `true` causes parts of words to be
generated: "PowerShot" => "Power" "Shot". Defaults to `true`.
`generate_number_parts`::
If `true` causes number subwords to be
generated: "500-42" => "500" "42". Defaults to `true`.
`catenate_words`::
If `true` causes maximum runs of word parts to be
catenated: "wi-fi" => "wifi". Defaults to `false`.
`catenate_numbers`::
If `true` causes maximum runs of number parts to
be catenated: "500-42" => "50042". Defaults to `false`.
`catenate_all`::
If `true` causes all subword parts to be catenated:
"wi-fi-4000" => "wifi4000". Defaults to `false`.
`split_on_case_change`::
If `true` causes "PowerShot" to be two tokens;
("Power-Shot" remains two parts regards). Defaults to `true`.
`preserve_original`::
If `true` includes original words in subwords:
"500-42" => "500-42" "500" "42". Defaults to `false`.
`split_on_numerics`::
If `true` causes "j2se" to be three tokens; "j"
"2" "se". Defaults to `true`.
`stem_english_possessive`::
If `true` causes trailing "'s" to be
removed for each subword: "O'Neil's" => "O", "Neil". Defaults to `true`.
Advanced settings include:
`protected_words`::
A list of words protected from being split into subwords.
Either an array, or set `protected_words_path`, which resolves
to a file containing the protected words (one per line).
It automatically resolves to a `config/`-based location if such a file exists.
`type_table`::
A custom type mapping table, for example (when configured
using `type_table_path`):
[source,js]
--------------------------------------------------
# Map the $, %, '.', and ',' characters to DIGIT
# This might be useful for financial data.
$ => DIGIT
% => DIGIT
. => DIGIT
\\u002C => DIGIT
# in some cases you might not want to split on ZWJ
# this also tests the case where we need a bigger byte[]
# see http://en.wikipedia.org/wiki/Zero-width_joiner
\\u200D => ALPHANUM
--------------------------------------------------
NOTE: Using a tokenizer like the `standard` tokenizer may interfere with
the `catenate_*` and `preserve_original` parameters, as the original
string may already have lost punctuation during tokenization. Instead,
you may want to use the `whitespace` tokenizer.
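For illustration only, here is a minimal sketch of wiring the filter into a custom analyzer on top of the `whitespace` tokenizer, as the note above recommends. The index, analyzer, and filter names are hypothetical and not part of this change:
[source,js]
--------------------------------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [ "my_word_delimiter_graph" ]
        }
      },
      "filter": {
        "my_word_delimiter_graph": {
          "type": "word_delimiter_graph",
          "preserve_original": true,
          "catenate_words": true
        }
      }
    }
  }
}
--------------------------------------------------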
View File
@ -103,8 +103,7 @@ If `term_vector` information is provided by setting `term_vector` to
will be used instead of the plain highlighter. The fast vector highlighter: will be used instead of the plain highlighter. The fast vector highlighter:
* Is faster especially for large fields (> `1MB`) * Is faster especially for large fields (> `1MB`)
* Can be customized with `boundary_chars`, `boundary_max_scan`, and * Can be customized with `boundary_scanner` (see <<boundary-scanners,below>>)
`fragment_offset` (see <<boundary-characters,below>>)
* Requires setting `term_vector` to `with_positions_offsets` which * Requires setting `term_vector` to `with_positions_offsets` which
increases the size of the index increases the size of the index
* Can combine matches from multiple fields into one result. See * Can combine matches from multiple fields into one result. See
@ -502,17 +501,23 @@ GET /_search
-------------------------------------------------- --------------------------------------------------
// CONSOLE // CONSOLE
[[boundary-characters]] [[boundary-scanners]]
==== Boundary Characters ==== Boundary Scanners
When highlighting a field using the fast vector highlighter, When highlighting a field using the fast vector highlighter, you can specify
`boundary_chars` can be configured to define what constitutes a boundary how to break the highlighted fragments using `boundary_scanner`, which accepts
for highlighting. It's a single string with each boundary character the following values:
defined in it. It defaults to `.,!? \t\n`.
The `boundary_max_scan` allows to control how far to look for boundary * `chars` (default): allows to configure which characters (`boundary_chars`)
characters, and defaults to `20`. constitute a boundary for highlighting. It's a single string with each boundary
character defined in it (defaults to `.,!? \t\n`). It also allows configuring
the `boundary_max_scan` to control how far to look for boundary characters
(defaults to `20`).
* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
to break the highlighted fragments at the next _word_ or _sentence_ boundary.
You can further specify `boundary_scanner_locale` to control which Locale is used
to search the text for these boundaries.
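As an illustrative sketch only (the `content` field is an assumption, not part of this change, and would need `term_vector` set to `with_positions_offsets` for the fast vector highlighter to apply), a request for sentence-bounded fragments in English might look like:
[source,js]
--------------------------------------------------
GET /_search
{
  "query": {
    "match": { "content": "sentence" }
  },
  "highlight": {
    "fields": {
      "content": {
        "type": "fvh",
        "boundary_scanner": "sentence",
        "boundary_scanner_locale": "en-US"
      }
    }
  }
}
--------------------------------------------------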
[[matched-fields]] [[matched-fields]]
==== Matched Fields ==== Matched Fields
View File
@ -0,0 +1,62 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.ingest.geoip;
import com.maxmind.geoip2.DatabaseReader;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.common.CheckedSupplier;
import org.elasticsearch.common.logging.Loggers;
import java.io.Closeable;
import java.io.IOException;
/**
* Facilitates lazy loading of the database reader, so that no memory is wasted on the database reader
* when the geoip plugin is installed but not used.
*/
final class DatabaseReaderLazyLoader implements Closeable {
private static final Logger LOGGER = Loggers.getLogger(DatabaseReaderLazyLoader.class);
private final String databaseFileName;
private final CheckedSupplier<DatabaseReader, IOException> loader;
// package protected for testing only:
final SetOnce<DatabaseReader> databaseReader;
DatabaseReaderLazyLoader(String databaseFileName, CheckedSupplier<DatabaseReader, IOException> loader) {
this.databaseFileName = databaseFileName;
this.loader = loader;
this.databaseReader = new SetOnce<>();
}
synchronized DatabaseReader get() throws IOException {
if (databaseReader.get() == null) {
databaseReader.set(loader.get());
LOGGER.debug("Loaded [{}] geoip database", databaseFileName);
}
return databaseReader.get();
}
@Override
public synchronized void close() throws IOException {
IOUtils.close(databaseReader.get());
}
}
View File
@ -19,19 +19,6 @@
package org.elasticsearch.ingest.geoip; package org.elasticsearch.ingest.geoip;
import java.io.IOException;
import java.net.InetAddress;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import com.maxmind.geoip2.DatabaseReader; import com.maxmind.geoip2.DatabaseReader;
import com.maxmind.geoip2.exception.AddressNotFoundException; import com.maxmind.geoip2.exception.AddressNotFoundException;
import com.maxmind.geoip2.model.CityResponse; import com.maxmind.geoip2.model.CityResponse;
@ -49,6 +36,19 @@ import org.elasticsearch.ingest.AbstractProcessor;
import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.IngestDocument;
import org.elasticsearch.ingest.Processor; import org.elasticsearch.ingest.Processor;
import java.io.IOException;
import java.net.InetAddress;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import static org.elasticsearch.ingest.ConfigurationUtils.newConfigurationException; import static org.elasticsearch.ingest.ConfigurationUtils.newConfigurationException;
import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty; import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty;
import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalList; import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalList;
@ -264,9 +264,9 @@ public final class GeoIpProcessor extends AbstractProcessor {
); );
static final Set<Property> DEFAULT_COUNTRY_PROPERTIES = EnumSet.of(Property.CONTINENT_NAME, Property.COUNTRY_ISO_CODE); static final Set<Property> DEFAULT_COUNTRY_PROPERTIES = EnumSet.of(Property.CONTINENT_NAME, Property.COUNTRY_ISO_CODE);
private final Map<String, DatabaseReader> databaseReaders; private final Map<String, DatabaseReaderLazyLoader> databaseReaders;
public Factory(Map<String, DatabaseReader> databaseReaders) { public Factory(Map<String, DatabaseReaderLazyLoader> databaseReaders) {
this.databaseReaders = databaseReaders; this.databaseReaders = databaseReaders;
} }
@ -279,12 +279,13 @@ public final class GeoIpProcessor extends AbstractProcessor {
List<String> propertyNames = readOptionalList(TYPE, processorTag, config, "properties"); List<String> propertyNames = readOptionalList(TYPE, processorTag, config, "properties");
boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false);
DatabaseReader databaseReader = databaseReaders.get(databaseFile); DatabaseReaderLazyLoader lazyLoader = databaseReaders.get(databaseFile);
if (databaseReader == null) { if (lazyLoader == null) {
throw newConfigurationException(TYPE, processorTag, throw newConfigurationException(TYPE, processorTag,
"database_file", "database file [" + databaseFile + "] doesn't exist"); "database_file", "database file [" + databaseFile + "] doesn't exist");
} }
DatabaseReader databaseReader = lazyLoader.get();
String databaseType = databaseReader.getMetadata().getDatabaseType(); String databaseType = databaseReader.getMetadata().getDatabaseType();
final Set<Property> properties; final Set<Property> properties;
View File
@ -19,6 +19,15 @@
package org.elasticsearch.ingest.geoip; package org.elasticsearch.ingest.geoip;
import com.maxmind.db.NoCache;
import com.maxmind.db.NodeCache;
import com.maxmind.geoip2.DatabaseReader;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.ingest.Processor;
import org.elasticsearch.plugins.IngestPlugin;
import org.elasticsearch.plugins.Plugin;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -35,20 +44,11 @@ import java.util.Map;
import java.util.stream.Stream; import java.util.stream.Stream;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import com.maxmind.db.NoCache;
import com.maxmind.db.NodeCache;
import com.maxmind.geoip2.DatabaseReader;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.ingest.Processor;
import org.elasticsearch.plugins.IngestPlugin;
import org.elasticsearch.plugins.Plugin;
public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable { public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable {
public static final Setting<Long> CACHE_SIZE = public static final Setting<Long> CACHE_SIZE =
Setting.longSetting("ingest.geoip.cache_size", 1000, 0, Setting.Property.NodeScope); Setting.longSetting("ingest.geoip.cache_size", 1000, 0, Setting.Property.NodeScope);
private Map<String, DatabaseReader> databaseReaders; private Map<String, DatabaseReaderLazyLoader> databaseReaders;
@Override @Override
public List<Setting<?>> getSettings() { public List<Setting<?>> getSettings() {
@ -76,12 +76,12 @@ public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable
return Collections.singletonMap(GeoIpProcessor.TYPE, new GeoIpProcessor.Factory(databaseReaders)); return Collections.singletonMap(GeoIpProcessor.TYPE, new GeoIpProcessor.Factory(databaseReaders));
} }
static Map<String, DatabaseReader> loadDatabaseReaders(Path geoIpConfigDirectory, NodeCache cache) throws IOException { static Map<String, DatabaseReaderLazyLoader> loadDatabaseReaders(Path geoIpConfigDirectory, NodeCache cache) throws IOException {
if (Files.exists(geoIpConfigDirectory) == false && Files.isDirectory(geoIpConfigDirectory)) { if (Files.exists(geoIpConfigDirectory) == false && Files.isDirectory(geoIpConfigDirectory)) {
throw new IllegalStateException("the geoip directory [" + geoIpConfigDirectory + "] containing databases doesn't exist"); throw new IllegalStateException("the geoip directory [" + geoIpConfigDirectory + "] containing databases doesn't exist");
} }
Map<String, DatabaseReader> databaseReaders = new HashMap<>(); Map<String, DatabaseReaderLazyLoader> databaseReaders = new HashMap<>();
try (Stream<Path> databaseFiles = Files.list(geoIpConfigDirectory)) { try (Stream<Path> databaseFiles = Files.list(geoIpConfigDirectory)) {
PathMatcher pathMatcher = geoIpConfigDirectory.getFileSystem().getPathMatcher("glob:**.mmdb.gz"); PathMatcher pathMatcher = geoIpConfigDirectory.getFileSystem().getPathMatcher("glob:**.mmdb.gz");
// Use iterator instead of forEach otherwise IOException needs to be caught twice... // Use iterator instead of forEach otherwise IOException needs to be caught twice...
@ -89,10 +89,13 @@ public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable
while (iterator.hasNext()) { while (iterator.hasNext()) {
Path databasePath = iterator.next(); Path databasePath = iterator.next();
if (Files.isRegularFile(databasePath) && pathMatcher.matches(databasePath)) { if (Files.isRegularFile(databasePath) && pathMatcher.matches(databasePath)) {
String databaseFileName = databasePath.getFileName().toString();
DatabaseReaderLazyLoader holder = new DatabaseReaderLazyLoader(databaseFileName, () -> {
try (InputStream inputStream = new GZIPInputStream(Files.newInputStream(databasePath, StandardOpenOption.READ))) { try (InputStream inputStream = new GZIPInputStream(Files.newInputStream(databasePath, StandardOpenOption.READ))) {
databaseReaders.put(databasePath.getFileName().toString(), return new DatabaseReader.Builder(inputStream).withCache(cache).build();
new DatabaseReader.Builder(inputStream).withCache(cache).build());
} }
});
databaseReaders.put(databaseFileName, holder);
} }
} }
} }
View File
@ -22,7 +22,6 @@ package org.elasticsearch.ingest.geoip;
import com.carrotsearch.randomizedtesting.generators.RandomPicks; import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import com.maxmind.db.NoCache; import com.maxmind.db.NoCache;
import com.maxmind.db.NodeCache; import com.maxmind.db.NodeCache;
import com.maxmind.geoip2.DatabaseReader;
import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.Randomness; import org.elasticsearch.common.Randomness;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
@ -48,7 +47,7 @@ import static org.hamcrest.Matchers.sameInstance;
public class GeoIpProcessorFactoryTests extends ESTestCase { public class GeoIpProcessorFactoryTests extends ESTestCase {
private static Map<String, DatabaseReader> databaseReaders; private static Map<String, DatabaseReaderLazyLoader> databaseReaders;
@BeforeClass @BeforeClass
public static void loadDatabaseReaders() throws IOException { public static void loadDatabaseReaders() throws IOException {
@ -66,7 +65,7 @@ public class GeoIpProcessorFactoryTests extends ESTestCase {
@AfterClass @AfterClass
public static void closeDatabaseReaders() throws IOException { public static void closeDatabaseReaders() throws IOException {
for (DatabaseReader reader : databaseReaders.values()) { for (DatabaseReaderLazyLoader reader : databaseReaders.values()) {
reader.close(); reader.close();
} }
databaseReaders = null; databaseReaders = null;
@ -222,4 +221,37 @@ public class GeoIpProcessorFactoryTests extends ESTestCase {
assertThat(e.getMessage(), equalTo("[properties] property isn't a list, but of type [java.lang.String]")); assertThat(e.getMessage(), equalTo("[properties] property isn't a list, but of type [java.lang.String]"));
} }
} }
public void testLazyLoading() throws Exception {
Path configDir = createTempDir();
Path geoIpConfigDir = configDir.resolve("ingest-geoip");
Files.createDirectories(geoIpConfigDir);
Files.copy(new ByteArrayInputStream(StreamsUtils.copyToBytesFromClasspath("/GeoLite2-City.mmdb.gz")),
geoIpConfigDir.resolve("GeoLite2-City.mmdb.gz"));
Files.copy(new ByteArrayInputStream(StreamsUtils.copyToBytesFromClasspath("/GeoLite2-Country.mmdb.gz")),
geoIpConfigDir.resolve("GeoLite2-Country.mmdb.gz"));
// Load separate database reader instances, because otherwise we can't test lazy loading as the
// database readers used at class level are reused between tests. (We want to keep that, otherwise running this
// test would take roughly 4 times longer.)
Map<String, DatabaseReaderLazyLoader> databaseReaders =
IngestGeoIpPlugin.loadDatabaseReaders(geoIpConfigDir, NoCache.getInstance());
GeoIpProcessor.Factory factory = new GeoIpProcessor.Factory(databaseReaders);
for (DatabaseReaderLazyLoader lazyLoader : databaseReaders.values()) {
assertNull(lazyLoader.databaseReader.get());
}
Map<String, Object> config = new HashMap<>();
config.put("field", "_field");
config.put("database_file", "GeoLite2-City.mmdb.gz");
factory.create(null, "_tag", config);
config = new HashMap<>();
config.put("field", "_field");
config.put("database_file", "GeoLite2-Country.mmdb.gz");
factory.create(null, "_tag", config);
for (DatabaseReaderLazyLoader lazyLoader : databaseReaders.values()) {
assertNotNull(lazyLoader.databaseReader.get());
}
}
} }
View File
@ -150,18 +150,7 @@ public class InternalAwsS3Service extends AbstractLifecycleComponent implements
if (key.length() == 0 && secret.length() == 0) { if (key.length() == 0 && secret.length() == 0) {
logger.debug("Using instance profile credentials"); logger.debug("Using instance profile credentials");
AWSCredentialsProvider credentials = new InstanceProfileCredentialsProvider(); return new PrivilegedInstanceProfileCredentialsProvider();
return new AWSCredentialsProvider() {
@Override
public AWSCredentials getCredentials() {
return SocketAccess.doPrivileged(credentials::getCredentials);
}
@Override
public void refresh() {
SocketAccess.doPrivilegedVoid(credentials::refresh);
}
};
} else { } else {
logger.debug("Using basic key/secret credentials"); logger.debug("Using basic key/secret credentials");
return new StaticCredentialsProvider(new BasicAWSCredentials(key.toString(), secret.toString())); return new StaticCredentialsProvider(new BasicAWSCredentials(key.toString(), secret.toString()));
@ -221,4 +210,22 @@ public class InternalAwsS3Service extends AbstractLifecycleComponent implements
// Ensure that IdleConnectionReaper is shutdown // Ensure that IdleConnectionReaper is shutdown
IdleConnectionReaper.shutdown(); IdleConnectionReaper.shutdown();
} }
static class PrivilegedInstanceProfileCredentialsProvider implements AWSCredentialsProvider {
private final InstanceProfileCredentialsProvider credentials;
private PrivilegedInstanceProfileCredentialsProvider() {
this.credentials = new InstanceProfileCredentialsProvider();
}
@Override
public AWSCredentials getCredentials() {
return SocketAccess.doPrivileged(credentials::getCredentials);
}
@Override
public void refresh() {
SocketAccess.doPrivilegedVoid(credentials::refresh);
}
}
} }
View File
@ -37,7 +37,7 @@ public class AwsS3ServiceImplTests extends ESTestCase {
public void testAWSCredentialsWithSystemProviders() { public void testAWSCredentialsWithSystemProviders() {
AWSCredentialsProvider credentialsProvider = AWSCredentialsProvider credentialsProvider =
InternalAwsS3Service.buildCredentials(logger, deprecationLogger, Settings.EMPTY, Settings.EMPTY, "default"); InternalAwsS3Service.buildCredentials(logger, deprecationLogger, Settings.EMPTY, Settings.EMPTY, "default");
assertThat(credentialsProvider, instanceOf(AWSCredentialsProvider.class)); assertThat(credentialsProvider, instanceOf(InternalAwsS3Service.PrivilegedInstanceProfileCredentialsProvider.class));
} }
public void testAwsCredsDefaultSettings() { public void testAwsCredsDefaultSettings() {
View File
@ -28,7 +28,7 @@ setup:
--- ---
"Basic": "Basic":
- skip: - skip:
version: " - 5.2.99" version: " - 5.99.99"
reason: this uses a new highlighter that has been added in 5.3 reason: this uses a new highlighter that has been added in 5.3
- do: - do:
search: search: