Merge branch 'master' into hlclient/add-delete-method

2025-03-09 14:34:43 +00:00 · 2017-02-24 09:23:03 +01:00 · 2017-02-24 09:23:03 +01:00 · 3e4b917066
commit 3e4b917066
parent 4ebc6dd0d0 211d50f7b8
22 changed files with 797 additions and 1251 deletions
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@ -140,6 +140,7 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
 import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
 import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
+import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
 import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
@ -225,6 +226,7 @@ public final class AnalysisModule {
        tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
        tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
        tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
+        tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
        tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
        tokenFilters.register("elision", ElisionTokenFilterFactory::new);
        tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);
--- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
@ -51,6 +51,7 @@ import org.apache.lucene.analysis.miscellaneous.TrimFilter;
 import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@ -87,6 +88,18 @@ public enum PreBuiltTokenFilters {
        }
    },

+    WORD_DELIMITER_GRAPH(CachingStrategy.ONE) {
+        /**
+         * Wraps the given stream in a {@link WordDelimiterGraphFilter} configured with a fixed
+         * set of flags (word parts, number parts, split on case change, split on numerics,
+         * English possessive stemming). The last constructor argument is passed as
+         * <code>null</code>. The <code>version</code> argument is not consulted.
+         */
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            final int configurationFlags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
+                | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
+                | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
+                | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
+                | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
+            return new WordDelimiterGraphFilter(tokenStream, configurationFlags, null);
+        }
+    },
+
    STOP(CachingStrategy.LUCENE) {
        @Override
        public TokenStream create(TokenStream tokenStream, Version version) {
--- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java
+++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java
@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;

 import org.apache.lucene.search.highlight.SimpleFragmenter;
 import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
+import org.elasticsearch.Version;
 import org.elasticsearch.action.support.ToXContentToBytes;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.ParsingException;
@ -32,10 +33,12 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryParseContext;
+import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
 import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order;

 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
 import java.util.function.BiFunction;
@ -57,8 +60,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
    public static final ParseField NUMBER_OF_FRAGMENTS_FIELD = new ParseField("number_of_fragments");
    public static final ParseField ENCODER_FIELD = new ParseField("encoder");
    public static final ParseField REQUIRE_FIELD_MATCH_FIELD = new ParseField("require_field_match");
+    public static final ParseField BOUNDARY_SCANNER_FIELD = new ParseField("boundary_scanner");
    public static final ParseField BOUNDARY_MAX_SCAN_FIELD = new ParseField("boundary_max_scan");
    public static final ParseField BOUNDARY_CHARS_FIELD = new ParseField("boundary_chars");
+    public static final ParseField BOUNDARY_SCANNER_LOCALE_FIELD = new ParseField("boundary_scanner_locale");
    public static final ParseField TYPE_FIELD = new ParseField("type");
    public static final ParseField FRAGMENTER_FIELD = new ParseField("fragmenter");
    public static final ParseField NO_MATCH_SIZE_FIELD = new ParseField("no_match_size");
@ -88,10 +93,14 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB

    protected Boolean forceSource;

+    protected BoundaryScannerType boundaryScannerType;
+
    protected Integer boundaryMaxScan;

    protected char[] boundaryChars;

+    protected Locale boundaryScannerLocale;
+
    protected Integer noMatchSize;

    protected Integer phraseLimit;
@ -119,10 +128,18 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
        order(in.readOptionalWriteable(Order::readFromStream));
        highlightFilter(in.readOptionalBoolean());
        forceSource(in.readOptionalBoolean());
+        if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
+            boundaryScannerType(in.readOptionalWriteable(BoundaryScannerType::readFromStream));
+        }
        boundaryMaxScan(in.readOptionalVInt());
        if (in.readBoolean()) {
            boundaryChars(in.readString().toCharArray());
        }
+        if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
+            if (in.readBoolean()) {
+                boundaryScannerLocale(in.readString());
+            }
+        }
        noMatchSize(in.readOptionalVInt());
        phraseLimit(in.readOptionalVInt());
        if (in.readBoolean()) {
@ -150,12 +167,22 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
        out.writeOptionalWriteable(order);
        out.writeOptionalBoolean(highlightFilter);
        out.writeOptionalBoolean(forceSource);
+        if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
+            out.writeOptionalWriteable(boundaryScannerType);
+        }
        out.writeOptionalVInt(boundaryMaxScan);
        boolean hasBounaryChars = boundaryChars != null;
        out.writeBoolean(hasBounaryChars);
        if (hasBounaryChars) {
            out.writeString(String.valueOf(boundaryChars));
        }
+        if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
+            boolean hasBoundaryScannerLocale = boundaryScannerLocale != null;
+            out.writeBoolean(hasBoundaryScannerLocale);
+            if (hasBoundaryScannerLocale) {
+                out.writeString(boundaryScannerLocale.toLanguageTag());
+            }
+        }
        out.writeOptionalVInt(noMatchSize);
        out.writeOptionalVInt(phraseLimit);
        boolean hasOptions = options != null;
@ -331,6 +358,33 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
        return this.highlightFilter;
    }

+    /**
+     * Selects, by name, the scanner that the <tt>fvh</tt> highlighter uses to locate
+     * fragment boundaries. The name is resolved through
+     * {@link BoundaryScannerType#fromString(String)}.
+     *
+     * @param boundaryScannerType name of the boundary scanner type
+     * @return this builder, to allow call chaining
+     */
+    @SuppressWarnings("unchecked")
+    public HB boundaryScannerType(String boundaryScannerType) {
+        final BoundaryScannerType parsedType = BoundaryScannerType.fromString(boundaryScannerType);
+        this.boundaryScannerType = parsedType;
+        return (HB) this;
+    }
+
+    /**
+     * Selects the scanner that the <tt>fvh</tt> highlighter uses to locate fragment
+     * boundaries. The given value is stored as-is, including <code>null</code>.
+     *
+     * @param boundaryScannerType the boundary scanner type to store
+     * @return this builder, to allow call chaining
+     */
+    @SuppressWarnings("unchecked")
+    public HB boundaryScannerType(BoundaryScannerType boundaryScannerType) {
+        final HB self = (HB) this;
+        this.boundaryScannerType = boundaryScannerType;
+        return self;
+    }
+
+    /**
+     * @return the boundary scanner type currently stored on this builder, as set through
+     *         {@link #boundaryScannerType(String)} or
+     *         {@link #boundaryScannerType(BoundaryScannerType)}; may be <code>null</code>
+     */
+    public BoundaryScannerType boundaryScannerType() {
+        return boundaryScannerType;
+    }
+
    /**
     * When using the highlighterType <tt>fvh</tt> this setting
     * controls how far to look for boundary characters, and defaults to 20.
@ -366,6 +420,25 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
        return this.boundaryChars;
    }

+    /**
+     * Sets the locale handed to the BreakIterator when the <tt>fvh</tt> highlighter runs
+     * with a <tt>word</tt> or <tt>sentence</tt> boundary scanner. The argument is parsed
+     * with {@link Locale#forLanguageTag(String)}.
+     *
+     * @param boundaryScannerLocale a language tag; <code>null</code> leaves any previously
+     *                              configured locale untouched
+     * @return this builder, to allow call chaining
+     */
+    @SuppressWarnings("unchecked")
+    public HB boundaryScannerLocale(String boundaryScannerLocale) {
+        if (boundaryScannerLocale == null) {
+            // nothing to parse: keep whatever locale was configured before
+            return (HB) this;
+        }
+        this.boundaryScannerLocale = Locale.forLanguageTag(boundaryScannerLocale);
+        return (HB) this;
+    }
+
+    /**
+     * @return the locale parsed by {@link #boundaryScannerLocale(String)}, or
+     *         <code>null</code> when none has been set
+     */
+    public Locale boundaryScannerLocale() {
+        return boundaryScannerLocale;
+    }
+
    /**
     * Allows to set custom options for custom highlighters.
     */
@ -491,12 +564,18 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
        if (highlightFilter != null) {
            builder.field(HIGHLIGHT_FILTER_FIELD.getPreferredName(), highlightFilter);
        }
+        if (boundaryScannerType != null) {
+            builder.field(BOUNDARY_SCANNER_FIELD.getPreferredName(), boundaryScannerType.name());
+        }
        if (boundaryMaxScan != null) {
            builder.field(BOUNDARY_MAX_SCAN_FIELD.getPreferredName(), boundaryMaxScan);
        }
        if (boundaryChars != null) {
            builder.field(BOUNDARY_CHARS_FIELD.getPreferredName(), new String(boundaryChars));
        }
+        if (boundaryScannerLocale != null) {
+            builder.field(BOUNDARY_SCANNER_LOCALE_FIELD.getPreferredName(), boundaryScannerLocale.toLanguageTag());
+        }
        if (options != null && options.size() > 0) {
            builder.field(OPTIONS_FIELD.getPreferredName(), options);
        }
@ -523,8 +602,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
        parser.declareInt(HB::fragmentSize, FRAGMENT_SIZE_FIELD);
        parser.declareInt(HB::numOfFragments, NUMBER_OF_FRAGMENTS_FIELD);
        parser.declareBoolean(HB::requireFieldMatch, REQUIRE_FIELD_MATCH_FIELD);
+        parser.declareString(HB::boundaryScannerType, BOUNDARY_SCANNER_FIELD);
        parser.declareInt(HB::boundaryMaxScan, BOUNDARY_MAX_SCAN_FIELD);
        parser.declareString((HB hb, String bc) -> hb.boundaryChars(bc.toCharArray()) , BOUNDARY_CHARS_FIELD);
+        parser.declareString(HB::boundaryScannerLocale, BOUNDARY_SCANNER_LOCALE_FIELD);
        parser.declareString(HB::highlighterType, TYPE_FIELD);
        parser.declareString(HB::fragmenter, FRAGMENTER_FIELD);
        parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
@ -562,8 +643,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
    public final int hashCode() {
        return Objects.hash(getClass(), Arrays.hashCode(preTags), Arrays.hashCode(postTags), fragmentSize,
                numOfFragments, highlighterType, fragmenter, highlightQuery, order, highlightFilter,
-                forceSource, boundaryMaxScan, Arrays.hashCode(boundaryChars), noMatchSize,
-                phraseLimit, options, requireFieldMatch, doHashCode());
+                forceSource, boundaryScannerType, boundaryMaxScan, Arrays.hashCode(boundaryChars), boundaryScannerLocale,
+                noMatchSize, phraseLimit, options, requireFieldMatch, doHashCode());
    }

    /**
@ -591,8 +672,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
               Objects.equals(order, other.order) &&
               Objects.equals(highlightFilter, other.highlightFilter) &&
               Objects.equals(forceSource, other.forceSource) &&
+               Objects.equals(boundaryScannerType, other.boundaryScannerType) &&
               Objects.equals(boundaryMaxScan, other.boundaryMaxScan) &&
               Arrays.equals(boundaryChars, other.boundaryChars) &&
+               Objects.equals(boundaryScannerLocale, other.boundaryScannerLocale) &&
               Objects.equals(noMatchSize, other.noMatchSize) &&
               Objects.equals(phraseLimit, other.phraseLimit) &&
               Objects.equals(options, other.options) &&
--- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java
+++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java
@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;
 import org.apache.lucene.search.highlight.Encoder;
 import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder;
 import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
+import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
 import org.apache.lucene.search.vectorhighlight.CustomFieldQuery;
 import org.apache.lucene.search.vectorhighlight.FieldFragList;
 import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
@ -38,15 +39,23 @@ import org.elasticsearch.common.text.Text;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;
+import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.Field;
+import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions;
 import org.elasticsearch.search.internal.SearchContext;

+import java.text.BreakIterator;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.Locale;
 import java.util.Map;

 public class FastVectorHighlighter implements Highlighter {

-    private static final SimpleBoundaryScanner DEFAULT_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
+    private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
+    private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
+            BreakIterator.getSentenceInstance(Locale.ROOT));
+    private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
+            BreakIterator.getWordInstance(Locale.ROOT));

    public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
        true, Setting.Property.NodeScope);
@ -105,12 +114,7 @@ public class FastVectorHighlighter implements Highlighter {
                FragListBuilder fragListBuilder;
                BaseFragmentsBuilder fragmentsBuilder;

-                BoundaryScanner boundaryScanner = DEFAULT_BOUNDARY_SCANNER;
-                if (field.fieldOptions().boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
-                        || field.fieldOptions().boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
-                    boundaryScanner = new SimpleBoundaryScanner(field.fieldOptions().boundaryMaxScan(),
-                            field.fieldOptions().boundaryChars());
-                }
+                final BoundaryScanner boundaryScanner = getBoundaryScanner(field);
                boolean forceSource = context.highlight().forceSource(field);
                if (field.fieldOptions().numberOfFragments() == 0) {
                    fragListBuilder = new SingleFragListBuilder();
@ -206,6 +210,29 @@ public class FastVectorHighlighter implements Highlighter {
                && fieldMapper.fieldType().storeTermVectorPositions();
    }

+    /**
+     * Builds the boundary scanner described by the field's highlight options.
+     * <p>
+     * A missing scanner type is treated as {@code CHARS} and a missing locale as
+     * {@link Locale#ROOT}, so options that carry nulls do not fail with a
+     * NullPointerException in the switch.
+     * <p>
+     * The word and sentence scanners are created per call: a {@link BreakIterator} holds
+     * the text it is currently iterating over, so one instance must not be shared between
+     * concurrently running highlight requests.
+     *
+     * @param field the highlighted field whose options select the scanner
+     * @return the scanner to use when building fragments for that field
+     */
+    private static BoundaryScanner getBoundaryScanner(Field field) {
+        final FieldOptions fieldOptions = field.fieldOptions();
+        final HighlightBuilder.BoundaryScannerType scannerType = fieldOptions.boundaryScannerType() != null
+                ? fieldOptions.boundaryScannerType()
+                : HighlightBuilder.BoundaryScannerType.CHARS;
+        final Locale scannerLocale = fieldOptions.boundaryScannerLocale() != null
+                ? fieldOptions.boundaryScannerLocale()
+                : Locale.ROOT;
+        switch (scannerType) {
+        case SENTENCE:
+            return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(scannerLocale));
+        case WORD:
+            return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(scannerLocale));
+        default:
+            // Compare the characters by content: merged options hold a copy of the default
+            // array, so a reference comparison would never match and would always allocate.
+            final boolean defaultMaxScan = fieldOptions.boundaryMaxScan() == SimpleBoundaryScanner.DEFAULT_MAX_SCAN;
+            final boolean defaultChars = java.util.Arrays.equals(fieldOptions.boundaryChars(),
+                    SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS);
+            if (defaultMaxScan && defaultChars) {
+                return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
+            }
+            return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
+        }
+    }
+
    private class MapperHighlightEntry {
        public FragListBuilder fragListBuilder;
        public FragmentsBuilder fragmentsBuilder;
--- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java
+++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java
@ -95,9 +95,9 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
            .preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED)
            .highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH)
            .forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE)
-            .numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER)
+            .numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER).boundaryScannerType(BoundaryScannerType.CHARS)
            .boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS)
-            .noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();
+            .boundaryScannerLocale(Locale.ROOT).noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();

    private final List<Field> fields = new ArrayList<>();

@ -327,12 +327,18 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
        if (highlighterBuilder.requireFieldMatch != null) {
            targetOptionsBuilder.requireFieldMatch(highlighterBuilder.requireFieldMatch);
        }
+        if (highlighterBuilder.boundaryScannerType != null) {
+            targetOptionsBuilder.boundaryScannerType(highlighterBuilder.boundaryScannerType);
+        }
        if (highlighterBuilder.boundaryMaxScan != null) {
            targetOptionsBuilder.boundaryMaxScan(highlighterBuilder.boundaryMaxScan);
        }
        if (highlighterBuilder.boundaryChars != null) {
            targetOptionsBuilder.boundaryChars(convertCharArray(highlighterBuilder.boundaryChars));
        }
+        if (highlighterBuilder.boundaryScannerLocale != null) {
+            targetOptionsBuilder.boundaryScannerLocale(highlighterBuilder.boundaryScannerLocale);
+        }
        if (highlighterBuilder.highlighterType != null) {
            targetOptionsBuilder.highlighterType(highlighterBuilder.highlighterType);
        }
@ -522,4 +528,30 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
            return name().toLowerCase(Locale.ROOT);
        }
    }
+
+    /**
+     * The kinds of boundary scanner that can be requested for highlighting. Values are
+     * written to and read from the transport stream as their ordinal, and are rendered in
+     * lower case by {@link #toString()}.
+     */
+    public enum BoundaryScannerType implements Writeable {
+        CHARS, WORD, SENTENCE;
+
+        /**
+         * Reads a value previously written by {@link #writeTo(StreamOutput)}.
+         *
+         * @throws IOException if the ordinal read from the stream does not match any constant
+         */
+        public static BoundaryScannerType readFromStream(StreamInput in) throws IOException {
+            int ordinal = in.readVInt();
+            if (ordinal < 0 || ordinal >= values().length) {
+                throw new IOException("Unknown BoundaryScannerType ordinal [" + ordinal + "]");
+            }
+            return values()[ordinal];
+        }
+
+        /**
+         * Writes this constant to the stream as its ordinal.
+         */
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            out.writeVInt(this.ordinal());
+        }
+
+        /**
+         * Resolves a scanner type from its name, ignoring case.
+         *
+         * @param boundaryScannerType the name to resolve, e.g. <code>"sentence"</code>
+         * @return the matching constant
+         * @throws NullPointerException     if the name is <code>null</code>
+         * @throws IllegalArgumentException if the name matches no constant; the message
+         *                                  lists the accepted values
+         */
+        public static BoundaryScannerType fromString(String boundaryScannerType) {
+            if (boundaryScannerType == null) {
+                throw new NullPointerException("boundary scanner type must not be null");
+            }
+            try {
+                return valueOf(boundaryScannerType.toUpperCase(Locale.ROOT));
+            } catch (IllegalArgumentException e) {
+                // valueOf only reports "No enum constant ...", which is of little help to
+                // a user who mistyped the value in a search request
+                throw new IllegalArgumentException("unknown boundary scanner type [" + boundaryScannerType
+                        + "], expected one of " + java.util.Arrays.toString(values()), e);
+            }
+        }
+
+        /**
+         * @return the constant name in lower case
+         */
+        @Override
+        public String toString() {
+            return name().toLowerCase(Locale.ROOT);
+        }
+    }
 }
--- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java
+++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java
@ -20,11 +20,13 @@
 package org.elasticsearch.search.fetch.subphase.highlight;

 import org.apache.lucene.search.Query;
+import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;

 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;

@ -110,10 +112,14 @@ public class SearchContextHighlight {

        private String fragmenter;

+        private BoundaryScannerType boundaryScannerType;
+
        private int boundaryMaxScan = -1;

        private Character[] boundaryChars = null;

+        private Locale boundaryScannerLocale;
+
        private Query highlightQuery;

        private int noMatchSize = -1;
@ -168,6 +174,10 @@ public class SearchContextHighlight {
            return fragmenter;
        }

+        /**
+         * @return the boundary scanner type held by these options; may be <code>null</code>
+         *         when it has not been set
+         */
+        public BoundaryScannerType boundaryScannerType() {
+            return this.boundaryScannerType;
+        }
+
        public int boundaryMaxScan() {
            return boundaryMaxScan;
        }
@ -176,6 +186,10 @@ public class SearchContextHighlight {
            return boundaryChars;
        }

+        /**
+         * @return the boundary scanner locale held by these options; may be <code>null</code>
+         *         when it has not been set
+         */
+        public Locale boundaryScannerLocale() {
+            return this.boundaryScannerLocale;
+        }
+
        public Query highlightQuery() {
            return highlightQuery;
        }
@ -260,6 +274,11 @@ public class SearchContextHighlight {
                return this;
            }

+            /**
+             * Stores the boundary scanner type on the options being built.
+             *
+             * @return this builder, to allow call chaining
+             */
+            Builder boundaryScannerType(BoundaryScannerType boundaryScanner) {
+                this.fieldOptions.boundaryScannerType = boundaryScanner;
+                return this;
+            }
+
            Builder boundaryMaxScan(int boundaryMaxScan) {
                fieldOptions.boundaryMaxScan = boundaryMaxScan;
                return this;
@ -270,6 +289,11 @@ public class SearchContextHighlight {
                return this;
            }

+            /**
+             * Stores the boundary scanner locale on the options being built.
+             *
+             * @return this builder, to allow call chaining
+             */
+            Builder boundaryScannerLocale(Locale boundaryScannerLocale) {
+                this.fieldOptions.boundaryScannerLocale = boundaryScannerLocale;
+                return this;
+            }
+
            Builder highlightQuery(Query highlightQuery) {
                fieldOptions.highlightQuery = highlightQuery;
                return this;
@ -324,12 +348,18 @@ public class SearchContextHighlight {
                if (fieldOptions.requireFieldMatch == null) {
                    fieldOptions.requireFieldMatch = globalOptions.requireFieldMatch;
                }
+                if (fieldOptions.boundaryScannerType == null) {
+                    fieldOptions.boundaryScannerType = globalOptions.boundaryScannerType;
+                }
                if (fieldOptions.boundaryMaxScan == -1) {
                    fieldOptions.boundaryMaxScan = globalOptions.boundaryMaxScan;
                }
                if (fieldOptions.boundaryChars == null && globalOptions.boundaryChars != null) {
                    fieldOptions.boundaryChars = Arrays.copyOf(globalOptions.boundaryChars, globalOptions.boundaryChars.length);
                }
+                if (fieldOptions.boundaryScannerLocale == null) {
+                    fieldOptions.boundaryScannerLocale = globalOptions.boundaryScannerLocale;
+                }
                if (fieldOptions.highlighterType == null) {
                    fieldOptions.highlighterType = globalOptions.highlighterType;
                }
--- a/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java
+++ b/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java
--- a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java
@ -0,0 +1,146 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.analysis;
+
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Shared test cases for {@link WordDelimiterTokenFilterFactory} and
+ * {@link WordDelimiterGraphTokenFilterFactory}. Subclasses pass the filter type name to the
+ * constructor; each test registers a filter of that type under the name
+ * <code>my_word_delimiter</code>, feeds it whitespace-tokenized text and checks the emitted terms.
+ */
+public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase {
+    /** Text analyzed by most of the tests below. */
+    private static final String SAMPLE_TEXT = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+
+    final String type;
+
+    public BaseWordDelimiterTokenFilterFactoryTestCase(String type) {
+        this.type = type;
+    }
+
+    public void testDefault() throws IOException {
+        assertFiltered(baseSettings().build(), SAMPLE_TEXT,
+            "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil");
+    }
+
+    public void testCatenateWords() throws IOException {
+        Settings settings = baseSettings()
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+                .build();
+        assertFiltered(settings, SAMPLE_TEXT,
+            "PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil");
+    }
+
+    public void testCatenateNumbers() throws IOException {
+        Settings settings = baseSettings()
+                .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
+                .build();
+        assertFiltered(settings, SAMPLE_TEXT,
+            "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil");
+    }
+
+    public void testCatenateAll() throws IOException {
+        Settings settings = baseSettings()
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
+                .build();
+        assertFiltered(settings, SAMPLE_TEXT,
+            "PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil");
+    }
+
+    public void testSplitOnCaseChange() throws IOException {
+        Settings settings = baseSettings()
+                .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
+                .build();
+        assertFiltered(settings, "PowerShot", "PowerShot");
+    }
+
+    public void testPreserveOriginal() throws IOException {
+        Settings settings = baseSettings()
+                .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
+                .build();
+        assertFiltered(settings, SAMPLE_TEXT,
+            "PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi",
+            "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil");
+    }
+
+    public void testStemEnglishPossessive() throws IOException {
+        Settings settings = baseSettings()
+                .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
+                .build();
+        assertFiltered(settings, SAMPLE_TEXT,
+            "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s");
+    }
+
+    /**
+     * Starts a settings builder holding what every test needs: a temporary home path and
+     * the <code>my_word_delimiter</code> filter declared with the type under test.
+     */
+    private Settings.Builder baseSettings() {
+        return Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_word_delimiter.type", type);
+    }
+
+    /**
+     * Builds the analysis registry from the settings, runs the source text through a
+     * whitespace tokenizer followed by the <code>my_word_delimiter</code> filter and asserts
+     * that exactly the expected terms come out, in order.
+     */
+    private void assertFiltered(Settings settings, String source, String... expected) throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
+    }
+}
--- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java
@ -0,0 +1,75 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.analysis;
+
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.test.ESTestCase;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Runs the tests inherited from {@link BaseWordDelimiterTokenFilterFactoryTestCase} against the
+ * {@code word_delimiter_graph} filter type, and adds tests that also check the position increment and
+ * position length arrays passed to {@code assertTokenStreamContents}.
+ */
+public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
+    /** Passes the filter type name {@code word_delimiter_graph} to the base test case. */
+    public WordDelimiterGraphTokenFilterFactoryTests() {
+        super("word_delimiter_graph");
+    }
+
+    /**
+     * Enables both {@code catenate_all} and {@code preserve_original} and checks the emitted terms together
+     * with their expected position increments and position lengths.
+     */
+    public void testMultiTerms() throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put("index.analysis.filter.my_word_delimiter.type", type)
+            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
+            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
+            .build());
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
+            "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
+            "ONeil", "O'Neil's", "O", "Neil" };
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        // both arrays hold one entry per expected token (26 each), in the same order as 'expected'
+        int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
+        int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
+        // the null arguments are expectations this test does not supply
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
+                expectedIncr, expectedPosLen, null);
+    }
+
+    /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
+    public void testPartsAndCatenate() throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put("index.analysis.filter.my_word_delimiter.type", type)
+            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+            .build());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
+        String source = "PowerShot";
+        // the catenated "PowerShot" is expected first with position length 2; "Power" shares its position
+        // (increment 0) and "Shot" follows at the next position
+        int[] expectedIncr = new int[]{1, 0, 1};
+        int[] expectedPosLen = new int[]{2, 1, 1};
+        String[] expected = new String[]{"PowerShot", "Power", "Shot" };
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
+            expectedIncr, expectedPosLen, null);
+    }
+}
--- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java
@ -24,121 +24,23 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.test.ESTestCase;
-import org.elasticsearch.test.ESTokenStreamTestCase;

 import java.io.IOException;
 import java.io.StringReader;

-public class WordDelimiterTokenFilterFactoryTests extends ESTokenStreamTestCase {
-    public void testDefault() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
-                .build());
-        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
-        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"};
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
-    }
-
-    public void testCatenateWords() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
-                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
-                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
-                .build());
-        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
-        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
-    }
-
-    public void testCatenateNumbers() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
-                .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
-                .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
-                .build());
-        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
-        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"};
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
-    }
-
-    public void testCatenateAll() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
-                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
-                .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
-                .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
-                .build());
-        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
-        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
-    }
-
-    public void testSplitOnCaseChange() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
-                .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
-                .build());
-        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
-        String source = "PowerShot";
-        String[] expected = new String[]{"PowerShot"};
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
-    }
-
-    public void testPreserveOriginal() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
-                .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
-                .build());
-        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
-        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
-    }
-
-    public void testStemEnglishPossessive() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
-                .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
-                .build());
-        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
-        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"};
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
+public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
+    public WordDelimiterTokenFilterFactoryTests() {
+        super("word_delimiter");
    }

    /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
    public void testPartsAndCatenate() throws IOException {
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
-                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
-                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
-                .build());
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put("index.analysis.filter.my_word_delimiter.type", type)
+            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+            .build());
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
        String source = "PowerShot";
        String[] expected = new String[]{"Power", "PowerShot", "Shot" };
--- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java
+++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java
@ -47,6 +47,7 @@ import org.elasticsearch.index.query.QueryParseContext;
 import org.elasticsearch.index.query.QueryShardContext;
 import org.elasticsearch.index.query.TermQueryBuilder;
 import org.elasticsearch.search.SearchModule;
+import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
 import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field;
 import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order;
 import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions;
@ -288,6 +289,7 @@ public class HighlightBuilderTests extends ESTestCase {
                        mergeBeforeChek(highlightBuilder, fieldBuilder, fieldOptions);

                checkSame.accept(AbstractHighlighterBuilder::boundaryChars, FieldOptions::boundaryChars);
+                checkSame.accept(AbstractHighlighterBuilder::boundaryScannerType, FieldOptions::boundaryScannerType);
                checkSame.accept(AbstractHighlighterBuilder::boundaryMaxScan, FieldOptions::boundaryMaxScan);
                checkSame.accept(AbstractHighlighterBuilder::fragmentSize, FieldOptions::fragmentCharSize);
                checkSame.accept(AbstractHighlighterBuilder::fragmenter, FieldOptions::fragmenter);
@ -557,12 +559,23 @@ public class HighlightBuilderTests extends ESTestCase {
        if (randomBoolean()) {
            highlightBuilder.forceSource(randomBoolean());
        }
+        if (randomBoolean()) {
+            if (randomBoolean()) {
+                highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values()));
+            } else {
+                // also test the string setter
+                highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values()).toString());
+            }
+        }
        if (randomBoolean()) {
            highlightBuilder.boundaryMaxScan(randomIntBetween(0, 10));
        }
        if (randomBoolean()) {
            highlightBuilder.boundaryChars(randomAsciiOfLengthBetween(1, 10).toCharArray());
        }
+        if (randomBoolean()) {
+            highlightBuilder.boundaryScannerLocale(randomLocale(random()).toLanguageTag());
+        }
        if (randomBoolean()) {
            highlightBuilder.noMatchSize(randomIntBetween(0, 10));
        }
--- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
+++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
@ -44,6 +44,7 @@ import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.builder.SearchSourceBuilder;
+import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
 import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field;
 import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.test.ESIntegTestCase;
@ -57,6 +58,7 @@ import java.io.IOException;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.Locale;
 import java.util.Map;

 import static org.elasticsearch.client.Requests.searchRequest;
@ -747,7 +749,94 @@ public class HighlighterSearchIT extends ESIntegTestCase {
        searchResponse = client().prepareSearch("test").setSource(source).get();

        assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <em>quick</em> brown fox jumps over"));
+    }

+    /**
+     * Indexes a two-sentence document into an index created with {@code type1TermVectorMapping()} and checks
+     * that highlighting {@code field1} with {@link BoundaryScannerType#SENTENCE} returns one fragment per
+     * sentence, each containing the tagged search term.
+     */
+    public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception {
+        assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
+        ensureGreen();
+
+        indexRandom(true, client().prepareIndex("test", "type1")
+                .setSource("field1", "A sentence with few words. Another sentence with even more words."));
+
+        logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
+        // NOTE(review): 20 and 2 are presumably the fragment size and number of fragments -- confirm
+        // against HighlightBuilder#field
+        SearchSourceBuilder source = searchSource()
+                .query(termQuery("field1", "sentence"))
+                .highlighter(highlight()
+                        .field("field1", 20, 2)
+                        .order("score")
+                        .preTags("<xxx>").postTags("</xxx>")
+                        .boundaryScannerType(BoundaryScannerType.SENTENCE));
+
+        SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
+
+        // each expected fragment is a whole sentence including its trailing space
+        assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
+        assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
+    }
+
+    /**
+     * Same scenario as the sentence boundary scanner test, but additionally sets
+     * {@code boundaryScannerLocale} to the language tag of {@link Locale#ENGLISH}; the expected fragments
+     * are the two whole sentences.
+     */
+    public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
+        assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
+        ensureGreen();
+
+        indexRandom(true, client().prepareIndex("test", "type1")
+                .setSource("field1", "A sentence with few words. Another sentence with even more words."));
+
+        logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
+        SearchSourceBuilder source = searchSource()
+                .query(termQuery("field1", "sentence"))
+                .highlighter(highlight()
+                        .field("field1", 20, 2)
+                        .order("score")
+                        .preTags("<xxx>").postTags("</xxx>")
+                        .boundaryScannerType(BoundaryScannerType.SENTENCE)
+                        .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
+
+        SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
+
+        // each expected fragment is a whole sentence including its trailing space
+        assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
+        assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
+    }
+
+    /**
+     * Indexes a single-sentence document into an index created with {@code type1TermVectorMapping()} and
+     * checks that highlighting {@code field1} with {@link BoundaryScannerType#WORD} yields a fragment that
+     * stops after {@code brown}, i.e. before the {@code :} in {@code brown:fox}.
+     */
+    public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception {
+        assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
+        ensureGreen();
+
+        indexRandom(true, client().prepareIndex("test", "type1")
+                .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
+
+        logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
+        // NOTE(review): 23 and 1 are presumably the fragment size and number of fragments -- confirm
+        // against HighlightBuilder#field
+        SearchSourceBuilder source = searchSource()
+                .query(termQuery("field1", "some"))
+                .highlighter(highlight()
+                        .field("field1", 23, 1)
+                        .order("score")
+                        .preTags("<xxx>").postTags("</xxx>")
+                        .boundaryScannerType(BoundaryScannerType.WORD));
+
+        SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
+
+        assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
+    }
+
+    /**
+     * Same scenario as the word boundary scanner test, but additionally sets {@code boundaryScannerLocale}
+     * to the language tag of {@link Locale#ENGLISH}; the expected fragment again stops after {@code brown}.
+     */
+    public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
+        assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
+        ensureGreen();
+
+        indexRandom(true, client().prepareIndex("test", "type1")
+                .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
+
+        logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
+        SearchSourceBuilder source = searchSource()
+                .query(termQuery("field1", "some"))
+                .highlighter(highlight()
+                        .field("field1", 23, 1)
+                        .order("score")
+                        .preTags("<xxx>").postTags("</xxx>")
+                        .boundaryScannerType(BoundaryScannerType.WORD)
+                        .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
+
+        SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
+
+        assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
    }

    /**
--- a/docs/build.gradle
+++ b/docs/build.gradle
@ -81,6 +81,7 @@ buildRestTests.expectedUnconvertedCandidates = [
  'reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc',
  'reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc',
  'reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc',
+  'reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc',
  'reference/cat/snapshots.asciidoc',
  'reference/cat/templates.asciidoc',
  'reference/cat/thread_pool.asciidoc',
--- a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc
@ -0,0 +1,97 @@
+[[analysis-word-delimiter-graph-tokenfilter]]
+=== Word Delimiter Graph Token Filter
+
+experimental[]
+
+Named `word_delimiter_graph`, it splits words into subwords and performs
+optional transformations on subword groups. Words are split into
+subwords with the following rules:
+
+* split on intra-word delimiters (by default, all
+non-alphanumeric characters):
+"Wi-Fi" -> "Wi", "Fi"
+* split on case transitions: "PowerShot" -> "Power", "Shot"
+* split on letter-number transitions: "SD500" -> "SD", "500"
+* leading and trailing intra-word delimiters on each subword are
+ignored: "//hello---there, 'dude'" -> "hello", "there", "dude"
+* trailing "'s" are removed for each subword: "O'Neil's" -> "O", "Neil"
+
+Unlike the `word_delimiter` token filter, this token filter correctly handles
+positions for multi-term expansion at search time when any of the following
+options are set to `true`:
+
+ * `preserve_original`
+ * `catenate_numbers`
+ * `catenate_words`
+ * `catenate_all`
+
+Parameters include:
+
+`generate_word_parts`::
+    If `true` causes parts of words to be
+    generated: "PowerShot" => "Power" "Shot". Defaults to `true`.
+
+`generate_number_parts`::
+    If `true` causes number subwords to be
+    generated: "500-42" => "500" "42". Defaults to `true`.
+
+`catenate_words`::
+    If `true` causes maximum runs of word parts to be
+    catenated: "wi-fi" => "wifi". Defaults to `false`.
+
+`catenate_numbers`::
+    If `true` causes maximum runs of number parts to
+    be catenated: "500-42" => "50042". Defaults to `false`.
+
+`catenate_all`::
+    If `true` causes all subword parts to be catenated:
+    "wi-fi-4000" => "wifi4000". Defaults to `false`.
+
+`split_on_case_change`::
+    If `true` causes "PowerShot" to be two tokens;
+    ("Power-Shot" remains two parts regardless). Defaults to `true`.
+
+`preserve_original`::
+    If `true` includes original words in subwords:
+    "500-42" => "500-42" "500" "42". Defaults to `false`.
+
+`split_on_numerics`::
+    If `true` causes "j2se" to be three tokens; "j"
+    "2" "se". Defaults to `true`.
+
+`stem_english_possessive`::
+    If `true` causes trailing "'s" to be
+    removed for each subword: "O'Neil's" => "O", "Neil". Defaults to `true`.
+
+Advanced settings include:
+
+`protected_words`::
+    A list of words protected from being delimited.
+    Either an array, or `protected_words_path` can be set instead, which resolves
+    to a file configured with protected words (one on each line).
+    Automatically resolves to a `config/` based location if it exists.
+
+`type_table`::
+    A custom type mapping table, for example (when configured
+    using `type_table_path`):
+
+[source,js]
+--------------------------------------------------
+    # Map the $, %, '.', and ',' characters to DIGIT
+    # This might be useful for financial data.
+    $ => DIGIT
+    % => DIGIT
+    . => DIGIT
+    \\u002C => DIGIT
+
+    # in some cases you might not want to split on ZWJ
+    # this also tests the case where we need a bigger byte[]
+    # see http://en.wikipedia.org/wiki/Zero-width_joiner
+    \\u200D => ALPHANUM
+--------------------------------------------------
+
+NOTE: Using a tokenizer like the `standard` tokenizer may interfere with
+the `catenate_*` and `preserve_original` parameters, as the original
+string may already have lost punctuation during tokenization.  Instead,
+you may want to use the `whitespace` tokenizer.
+
--- a/docs/reference/search/request/highlighting.asciidoc
+++ b/docs/reference/search/request/highlighting.asciidoc
@ -103,8 +103,7 @@ If `term_vector` information is provided by setting `term_vector` to
 will be used instead of the plain highlighter.  The fast vector highlighter:

 * Is faster especially for large fields (> `1MB`)
-* Can be customized with `boundary_chars`, `boundary_max_scan`, and
- `fragment_offset` (see <<boundary-characters,below>>)
+* Can be customized with `boundary_scanner` (see <<boundary-scanners,below>>)
 * Requires setting `term_vector` to `with_positions_offsets` which
  increases the size of the index
 * Can combine matches from multiple fields into one result.  See
@ -502,17 +501,23 @@ GET /_search
 --------------------------------------------------
 // CONSOLE

-[[boundary-characters]]
-==== Boundary Characters
+[[boundary-scanners]]
+==== Boundary Scanners

-When highlighting a field using the fast vector highlighter,
-`boundary_chars` can be configured to define what constitutes a boundary
-for highlighting. It's a single string with each boundary character
-defined in it. It defaults to `.,!? \t\n`.
+When highlighting a field using the fast vector highlighter, you can specify
+how to break the highlighted fragments using `boundary_scanner`, which accepts
+the following values:

-The `boundary_max_scan` allows to control how far to look for boundary
-characters, and defaults to `20`.
+* `chars` (default): allows you to configure which characters (`boundary_chars`)
+constitute a boundary for highlighting. It's a single string with each boundary
+character defined in it (defaults to `.,!? \t\n`). It also allows configuring
+the `boundary_max_scan` to control how far to look for boundary characters
+(defaults to `20`).

+* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
+to break the highlighted fragments at the next _word_ or _sentence_ boundary.
+You can further specify `boundary_scanner_locale` to control which Locale is used
+to search the text for these boundaries.

 [[matched-fields]]
 ==== Matched Fields
--- a/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/DatabaseReaderLazyLoader.java
+++ b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/DatabaseReaderLazyLoader.java
@ -0,0 +1,62 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.ingest.geoip;
+
+import com.maxmind.geoip2.DatabaseReader;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.SetOnce;
+import org.elasticsearch.common.CheckedSupplier;
+import org.elasticsearch.common.logging.Loggers;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/**
+ * Facilitates lazy loading of the database reader, so that when the geoip plugin is installed, but not used,
+ * no memory is being wasted on the database reader.
+ * <p>
+ * Both {@link #get()} and {@link #close()} are {@code synchronized} on this instance.
+ */
+final class DatabaseReaderLazyLoader implements Closeable {
+
+    private static final Logger LOGGER = Loggers.getLogger(DatabaseReaderLazyLoader.class);
+
+    // only used in the debug message logged after the database has been loaded
+    private final String databaseFileName;
+    // invoked by get() while no reader has been stored yet
+    private final CheckedSupplier<DatabaseReader, IOException> loader;
+    // package protected for testing only:
+    final SetOnce<DatabaseReader> databaseReader;
+
+    /**
+     * Stores the file name and loader; the loader is not invoked here.
+     *
+     * @param databaseFileName name reported in the debug log message once the database is loaded
+     * @param loader supplier that {@link #get()} calls to create the reader
+     */
+    DatabaseReaderLazyLoader(String databaseFileName, CheckedSupplier<DatabaseReader, IOException> loader) {
+        this.databaseFileName = databaseFileName;
+        this.loader = loader;
+        this.databaseReader = new SetOnce<>();
+    }
+
+    /**
+     * Returns the database reader, calling the loader and storing its result if no reader has been stored yet.
+     * If the loader throws, nothing is stored and the exception propagates to the caller.
+     *
+     * @return the stored reader
+     * @throws IOException if the loader fails
+     */
+    synchronized DatabaseReader get() throws IOException {
+        if (databaseReader.get() == null) {
+            databaseReader.set(loader.get());
+            LOGGER.debug("Loaded [{}] geoip database", databaseFileName);
+        }
+        return databaseReader.get();
+    }
+
+    /**
+     * Passes the currently stored reader to {@code IOUtils.close}.
+     * NOTE(review): when {@link #get()} was never called the stored value is {@code null}; this relies on
+     * {@code IOUtils.close} tolerating a null argument -- confirm.
+     */
+    @Override
+    public synchronized void close() throws IOException {
+        IOUtils.close(databaseReader.get());
+    }
+}
--- a/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/GeoIpProcessor.java
+++ b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/GeoIpProcessor.java
@ -19,19 +19,6 @@

 package org.elasticsearch.ingest.geoip;

-import java.io.IOException;
-import java.net.InetAddress;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-
 import com.maxmind.geoip2.DatabaseReader;
 import com.maxmind.geoip2.exception.AddressNotFoundException;
 import com.maxmind.geoip2.model.CityResponse;
@ -49,6 +36,19 @@ import org.elasticsearch.ingest.AbstractProcessor;
 import org.elasticsearch.ingest.IngestDocument;
 import org.elasticsearch.ingest.Processor;

+import java.io.IOException;
+import java.net.InetAddress;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
 import static org.elasticsearch.ingest.ConfigurationUtils.newConfigurationException;
 import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty;
 import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalList;
@ -264,9 +264,9 @@ public final class GeoIpProcessor extends AbstractProcessor {
        );
        static final Set<Property> DEFAULT_COUNTRY_PROPERTIES = EnumSet.of(Property.CONTINENT_NAME, Property.COUNTRY_ISO_CODE);

-        private final Map<String, DatabaseReader> databaseReaders;
+        private final Map<String, DatabaseReaderLazyLoader> databaseReaders;

-        public Factory(Map<String, DatabaseReader> databaseReaders) {
+        public Factory(Map<String, DatabaseReaderLazyLoader> databaseReaders) {
            this.databaseReaders = databaseReaders;
        }

@ -279,12 +279,13 @@ public final class GeoIpProcessor extends AbstractProcessor {
            List<String> propertyNames = readOptionalList(TYPE, processorTag, config, "properties");
            boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false);

-            DatabaseReader databaseReader = databaseReaders.get(databaseFile);
-            if (databaseReader == null) {
+            DatabaseReaderLazyLoader lazyLoader = databaseReaders.get(databaseFile);
+            if (lazyLoader == null) {
                throw newConfigurationException(TYPE, processorTag,
                    "database_file", "database file [" + databaseFile + "] doesn't exist");
            }

+            DatabaseReader databaseReader = lazyLoader.get();
            String databaseType = databaseReader.getMetadata().getDatabaseType();

            final Set<Property> properties;
--- a/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/IngestGeoIpPlugin.java
+++ b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/IngestGeoIpPlugin.java
@ -19,6 +19,15 @@

 package org.elasticsearch.ingest.geoip;

+import com.maxmind.db.NoCache;
+import com.maxmind.db.NodeCache;
+import com.maxmind.geoip2.DatabaseReader;
+import org.apache.lucene.util.IOUtils;
+import org.elasticsearch.common.settings.Setting;
+import org.elasticsearch.ingest.Processor;
+import org.elasticsearch.plugins.IngestPlugin;
+import org.elasticsearch.plugins.Plugin;
+
 import java.io.Closeable;
 import java.io.IOException;
 import java.io.InputStream;
@ -35,20 +44,11 @@ import java.util.Map;
 import java.util.stream.Stream;
 import java.util.zip.GZIPInputStream;

-import com.maxmind.db.NoCache;
-import com.maxmind.db.NodeCache;
-import com.maxmind.geoip2.DatabaseReader;
-import org.apache.lucene.util.IOUtils;
-import org.elasticsearch.common.settings.Setting;
-import org.elasticsearch.ingest.Processor;
-import org.elasticsearch.plugins.IngestPlugin;
-import org.elasticsearch.plugins.Plugin;
-
 public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable {
    public static final Setting<Long> CACHE_SIZE =
        Setting.longSetting("ingest.geoip.cache_size", 1000, 0, Setting.Property.NodeScope);

-    private Map<String, DatabaseReader> databaseReaders;
+    private Map<String, DatabaseReaderLazyLoader> databaseReaders;

    @Override
    public List<Setting<?>> getSettings() {
@ -76,12 +76,12 @@ public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable
        return Collections.singletonMap(GeoIpProcessor.TYPE, new GeoIpProcessor.Factory(databaseReaders));
    }

-    static Map<String, DatabaseReader> loadDatabaseReaders(Path geoIpConfigDirectory, NodeCache cache) throws IOException {
+    static Map<String, DatabaseReaderLazyLoader> loadDatabaseReaders(Path geoIpConfigDirectory, NodeCache cache) throws IOException {
        if (Files.exists(geoIpConfigDirectory) == false && Files.isDirectory(geoIpConfigDirectory)) {
            throw new IllegalStateException("the geoip directory [" + geoIpConfigDirectory  + "] containing databases doesn't exist");
        }

-        Map<String, DatabaseReader> databaseReaders = new HashMap<>();
+        Map<String, DatabaseReaderLazyLoader> databaseReaders = new HashMap<>();
        try (Stream<Path> databaseFiles = Files.list(geoIpConfigDirectory)) {
            PathMatcher pathMatcher = geoIpConfigDirectory.getFileSystem().getPathMatcher("glob:**.mmdb.gz");
            // Use iterator instead of forEach otherwise IOException needs to be caught twice...
@ -89,10 +89,13 @@ public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable
            while (iterator.hasNext()) {
                Path databasePath = iterator.next();
                if (Files.isRegularFile(databasePath) && pathMatcher.matches(databasePath)) {
-                    try (InputStream inputStream = new GZIPInputStream(Files.newInputStream(databasePath, StandardOpenOption.READ))) {
-                        databaseReaders.put(databasePath.getFileName().toString(),
-                            new DatabaseReader.Builder(inputStream).withCache(cache).build());
-                    }
+                    String databaseFileName = databasePath.getFileName().toString();
+                    DatabaseReaderLazyLoader holder = new DatabaseReaderLazyLoader(databaseFileName, () -> {
+                        try (InputStream inputStream = new GZIPInputStream(Files.newInputStream(databasePath, StandardOpenOption.READ))) {
+                            return new DatabaseReader.Builder(inputStream).withCache(cache).build();
+                        }
+                    });
+                    databaseReaders.put(databaseFileName, holder);
                }
            }
        }
--- a/plugins/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpProcessorFactoryTests.java
+++ b/plugins/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpProcessorFactoryTests.java
@ -22,7 +22,6 @@ package org.elasticsearch.ingest.geoip;
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import com.maxmind.db.NoCache;
 import com.maxmind.db.NodeCache;
-import com.maxmind.geoip2.DatabaseReader;
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.Randomness;
 import org.elasticsearch.test.ESTestCase;
@ -48,7 +47,7 @@ import static org.hamcrest.Matchers.sameInstance;

 public class GeoIpProcessorFactoryTests extends ESTestCase {

-    private static Map<String, DatabaseReader> databaseReaders;
+    private static Map<String, DatabaseReaderLazyLoader> databaseReaders;

    @BeforeClass
    public static void loadDatabaseReaders() throws IOException {
@ -66,7 +65,7 @@ public class GeoIpProcessorFactoryTests extends ESTestCase {

    @AfterClass
    public static void closeDatabaseReaders() throws IOException {
-        for (DatabaseReader reader : databaseReaders.values()) {
+        for (DatabaseReaderLazyLoader reader : databaseReaders.values()) {
            reader.close();
        }
        databaseReaders = null;
@ -222,4 +221,37 @@ public class GeoIpProcessorFactoryTests extends ESTestCase {
            assertThat(e.getMessage(), equalTo("[properties] property isn't a list, but of type [java.lang.String]"));
        }
    }
+
+    /**
+     * Checks that database readers are created lazily: after {@code loadDatabaseReaders} and the factory
+     * construction every loader's holder is still empty, and after creating one processor per database file
+     * every holder contains a reader.
+     */
+    public void testLazyLoading() throws Exception {
+        // Copy both test databases from the classpath into a fresh temporary "ingest-geoip" config directory.
+        Path configDir = createTempDir();
+        Path geoIpConfigDir = configDir.resolve("ingest-geoip");
+        Files.createDirectories(geoIpConfigDir);
+        Files.copy(new ByteArrayInputStream(StreamsUtils.copyToBytesFromClasspath("/GeoLite2-City.mmdb.gz")),
+            geoIpConfigDir.resolve("GeoLite2-City.mmdb.gz"));
+        Files.copy(new ByteArrayInputStream(StreamsUtils.copyToBytesFromClasspath("/GeoLite2-Country.mmdb.gz")),
+            geoIpConfigDir.resolve("GeoLite2-Country.mmdb.gz"));
+
+        // Load a separate set of database reader instances, because otherwise we can't test lazy loading: the
+        // database readers used at class level are reused between tests. (We want to keep that reuse, otherwise
+        // running this test will take roughly 4 times more time.)
+        Map<String, DatabaseReaderLazyLoader> databaseReaders =
+            IngestGeoIpPlugin.loadDatabaseReaders(geoIpConfigDir, NoCache.getInstance());
+        GeoIpProcessor.Factory factory = new GeoIpProcessor.Factory(databaseReaders);
+        // Nothing may be loaded yet: neither listing the directory nor building the factory should create a reader.
+        for (DatabaseReaderLazyLoader lazyLoader : databaseReaders.values()) {
+            assertNull(lazyLoader.databaseReader.get());
+        }
+
+        // Create one processor per database file; the returned processors are not needed, only the loading side effect.
+        Map<String, Object> config = new HashMap<>();
+        config.put("field", "_field");
+        config.put("database_file", "GeoLite2-City.mmdb.gz");
+        factory.create(null, "_tag", config);
+        config = new HashMap<>();
+        config.put("field", "_field");
+        config.put("database_file", "GeoLite2-Country.mmdb.gz");
+        factory.create(null, "_tag", config);
+
+        // Both databases have now been requested, so every holder must contain a reader.
+        for (DatabaseReaderLazyLoader lazyLoader : databaseReaders.values()) {
+            assertNotNull(lazyLoader.databaseReader.get());
+        }
+    }
 }
--- a/plugins/repository-s3/src/main/java/org/elasticsearch/cloud/aws/InternalAwsS3Service.java
+++ b/plugins/repository-s3/src/main/java/org/elasticsearch/cloud/aws/InternalAwsS3Service.java
@ -150,18 +150,7 @@ public class InternalAwsS3Service extends AbstractLifecycleComponent implements

            if (key.length() == 0 && secret.length() == 0) {
                logger.debug("Using instance profile credentials");
-                AWSCredentialsProvider credentials = new InstanceProfileCredentialsProvider();
-                return new AWSCredentialsProvider() {
-                    @Override
-                    public AWSCredentials getCredentials() {
-                        return SocketAccess.doPrivileged(credentials::getCredentials);
-                    }
-
-                    @Override
-                    public void refresh() {
-                        SocketAccess.doPrivilegedVoid(credentials::refresh);
-                    }
-                };
+                return new PrivilegedInstanceProfileCredentialsProvider();
            } else {
                logger.debug("Using basic key/secret credentials");
                return new StaticCredentialsProvider(new BasicAWSCredentials(key.toString(), secret.toString()));
@ -221,4 +210,22 @@ public class InternalAwsS3Service extends AbstractLifecycleComponent implements
        // Ensure that IdleConnectionReaper is shutdown
        IdleConnectionReaper.shutdown();
    }
+
+    /**
+     * {@link AWSCredentialsProvider} that delegates to an {@link InstanceProfileCredentialsProvider} and routes
+     * both calls through {@code SocketAccess} ({@code doPrivileged} for {@link #getCredentials()},
+     * {@code doPrivilegedVoid} for {@link #refresh()}).
+     * As a named class (rather than an anonymous one) it can be checked for by type.
+     */
+    static class PrivilegedInstanceProfileCredentialsProvider implements AWSCredentialsProvider {
+        // The wrapped provider; created once in the constructor and never replaced.
+        private final InstanceProfileCredentialsProvider credentials;
+
+        private PrivilegedInstanceProfileCredentialsProvider() {
+            this.credentials = new InstanceProfileCredentialsProvider();
+        }
+
+        @Override
+        public AWSCredentials getCredentials() {
+            return SocketAccess.doPrivileged(credentials::getCredentials);
+        }
+
+        @Override
+        public void refresh() {
+            SocketAccess.doPrivilegedVoid(credentials::refresh);
+        }
+    }
 }
--- a/plugins/repository-s3/src/test/java/org/elasticsearch/cloud/aws/AwsS3ServiceImplTests.java
+++ b/plugins/repository-s3/src/test/java/org/elasticsearch/cloud/aws/AwsS3ServiceImplTests.java
@ -37,7 +37,7 @@ public class AwsS3ServiceImplTests extends ESTestCase {
    public void testAWSCredentialsWithSystemProviders() {
        AWSCredentialsProvider credentialsProvider =
            InternalAwsS3Service.buildCredentials(logger, deprecationLogger, Settings.EMPTY, Settings.EMPTY, "default");
-        assertThat(credentialsProvider, instanceOf(AWSCredentialsProvider.class));
+        assertThat(credentialsProvider, instanceOf(InternalAwsS3Service.PrivilegedInstanceProfileCredentialsProvider.class));
    }

    public void testAwsCredsDefaultSettings() {
--- a/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/10_unified.yaml
+++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/10_unified.yaml
@ -28,7 +28,7 @@ setup:
 ---
 "Basic":
  - skip:
-      version: " - 5.2.99"
+      version: " - 5.99.99"
      reason:  this uses a new highlighter that has been added in 5.3
  - do:
      search: