diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 1aaf3077aea..61950942e60 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -140,6 +140,7 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
 import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
 import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
+import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
 import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
@@ -225,6 +226,7 @@ public final class AnalysisModule {
         tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
         tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
         tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
+        tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
         tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
         tokenFilters.register("elision", ElisionTokenFilterFactory::new);
         tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
index 53e79cb9dfe..6c58ab884db 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
@@ -51,6 +51,7 @@ import org.apache.lucene.analysis.miscellaneous.TrimFilter;
 import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@@ -87,6 +88,18 @@ public enum PreBuiltTokenFilters {
         }
     },
+    WORD_DELIMITER_GRAPH(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new WordDelimiterGraphFilter(tokenStream,
+                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS |
+                    WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS |
+                    WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE |
+                    WordDelimiterGraphFilter.SPLIT_ON_NUMERICS |
+                    WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);
+        }
+    },
+
     STOP(CachingStrategy.LUCENE) {
         @Override
         public TokenStream create(TokenStream tokenStream, Version version) {
diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java
index e3a78227d9c..3a3c1cfd66d 100644
--- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java
+++
b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java @@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; +import org.elasticsearch.Version; import org.elasticsearch.action.support.ToXContentToBytes; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.ParsingException; @@ -32,10 +33,12 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryParseContext; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order; import java.io.IOException; import java.util.Arrays; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.function.BiFunction; @@ -57,8 +60,10 @@ public abstract class AbstractHighlighterBuilderfvh this setting + * controls which scanner to use for fragment boundaries, and defaults to "simple". + */ + @SuppressWarnings("unchecked") + public HB boundaryScannerType(String boundaryScannerType) { + this.boundaryScannerType = BoundaryScannerType.fromString(boundaryScannerType); + return (HB) this; + } + + /** + * When using the highlighterType fvh this setting + * controls which scanner to use for fragment boundaries, and defaults to "simple". + */ + @SuppressWarnings("unchecked") + public HB boundaryScannerType(BoundaryScannerType boundaryScannerType) { + this.boundaryScannerType = boundaryScannerType; + return (HB) this; + } + + /** + * @return the value set by {@link #boundaryScannerType(String)} + */ + public BoundaryScannerType boundaryScannerType() { + return this.boundaryScannerType; + } + /** * When using the highlighterType fvh this setting * controls how far to look for boundary characters, and defaults to 20. @@ -366,6 +420,25 @@ public abstract class AbstractHighlighterBuilderfvh and boundaryScannerType break_iterator, this setting + * controls the locale to use by the BreakIterator, defaults to "root". + */ + @SuppressWarnings("unchecked") + public HB boundaryScannerLocale(String boundaryScannerLocale) { + if (boundaryScannerLocale != null) { + this.boundaryScannerLocale = Locale.forLanguageTag(boundaryScannerLocale); + } + return (HB) this; + } + + /** + * @return the value set by {@link #boundaryScannerLocale(String)} + */ + public Locale boundaryScannerLocale() { + return this.boundaryScannerLocale; + } + /** * Allows to set custom options for custom highlighters. 
*/ @@ -491,12 +564,18 @@ public abstract class AbstractHighlighterBuilder 0) { builder.field(OPTIONS_FIELD.getPreferredName(), options); } @@ -523,8 +602,10 @@ public abstract class AbstractHighlighterBuilder hb.boundaryChars(bc.toCharArray()) , BOUNDARY_CHARS_FIELD); + parser.declareString(HB::boundaryScannerLocale, BOUNDARY_SCANNER_LOCALE_FIELD); parser.declareString(HB::highlighterType, TYPE_FIELD); parser.declareString(HB::fragmenter, FRAGMENTER_FIELD); parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD); @@ -562,8 +643,8 @@ public abstract class AbstractHighlighterBuilder SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope); @@ -105,12 +114,7 @@ public class FastVectorHighlighter implements Highlighter { FragListBuilder fragListBuilder; BaseFragmentsBuilder fragmentsBuilder; - BoundaryScanner boundaryScanner = DEFAULT_BOUNDARY_SCANNER; - if (field.fieldOptions().boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN - || field.fieldOptions().boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { - boundaryScanner = new SimpleBoundaryScanner(field.fieldOptions().boundaryMaxScan(), - field.fieldOptions().boundaryChars()); - } + final BoundaryScanner boundaryScanner = getBoundaryScanner(field); boolean forceSource = context.highlight().forceSource(field); if (field.fieldOptions().numberOfFragments() == 0) { fragListBuilder = new SingleFragListBuilder(); @@ -206,6 +210,29 @@ public class FastVectorHighlighter implements Highlighter { && fieldMapper.fieldType().storeTermVectorPositions(); } + private static BoundaryScanner getBoundaryScanner(Field field) { + final FieldOptions fieldOptions = field.fieldOptions(); + final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale(); + switch(fieldOptions.boundaryScannerType()) { + case SENTENCE: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); + } + return DEFAULT_SENTENCE_BOUNDARY_SCANNER; + case WORD: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); + } + return DEFAULT_WORD_BOUNDARY_SCANNER; + default: + if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN + || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { + return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); + } + return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + } + } + private class MapperHighlightEntry { public FragListBuilder fragListBuilder; public FragmentsBuilder fragmentsBuilder; diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java index a063b2900d5..45b8c612a76 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -95,9 +95,9 @@ public class HighlightBuilder extends AbstractHighlighterBuilder fields = new ArrayList<>(); @@ -327,12 +327,18 @@ public class HighlightBuilder extends AbstractHighlighterBuilder= values().length) { + throw new IOException("Unknown BoundaryScannerType ordinal [" + ordinal + "]"); + } + return values()[ordinal]; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + 
out.writeVInt(this.ordinal()); + } + + public static BoundaryScannerType fromString(String boundaryScannerType) { + return valueOf(boundaryScannerType.toUpperCase(Locale.ROOT)); + } + + @Override + public String toString() { + return name().toLowerCase(Locale.ROOT); + } + } } diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java index d4731718793..2baf73ab5fa 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java @@ -20,11 +20,13 @@ package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.search.Query; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -110,10 +112,14 @@ public class SearchContextHighlight { private String fragmenter; + private BoundaryScannerType boundaryScannerType; + private int boundaryMaxScan = -1; private Character[] boundaryChars = null; + private Locale boundaryScannerLocale; + private Query highlightQuery; private int noMatchSize = -1; @@ -168,6 +174,10 @@ public class SearchContextHighlight { return fragmenter; } + public BoundaryScannerType boundaryScannerType() { + return boundaryScannerType; + } + public int boundaryMaxScan() { return boundaryMaxScan; } @@ -176,6 +186,10 @@ public class SearchContextHighlight { return boundaryChars; } + public Locale boundaryScannerLocale() { + return boundaryScannerLocale; + } + public Query highlightQuery() { return highlightQuery; } @@ -260,6 +274,11 @@ public class SearchContextHighlight { return this; } + Builder boundaryScannerType(BoundaryScannerType boundaryScanner) { + fieldOptions.boundaryScannerType = boundaryScanner; + return this; + } + Builder boundaryMaxScan(int boundaryMaxScan) { fieldOptions.boundaryMaxScan = boundaryMaxScan; return this; @@ -270,6 +289,11 @@ public class SearchContextHighlight { return this; } + Builder boundaryScannerLocale(Locale boundaryScannerLocale) { + fieldOptions.boundaryScannerLocale = boundaryScannerLocale; + return this; + } + Builder highlightQuery(Query highlightQuery) { fieldOptions.highlightQuery = highlightQuery; return this; @@ -324,12 +348,18 @@ public class SearchContextHighlight { if (fieldOptions.requireFieldMatch == null) { fieldOptions.requireFieldMatch = globalOptions.requireFieldMatch; } + if (fieldOptions.boundaryScannerType == null) { + fieldOptions.boundaryScannerType = globalOptions.boundaryScannerType; + } if (fieldOptions.boundaryMaxScan == -1) { fieldOptions.boundaryMaxScan = globalOptions.boundaryMaxScan; } if (fieldOptions.boundaryChars == null && globalOptions.boundaryChars != null) { fieldOptions.boundaryChars = Arrays.copyOf(globalOptions.boundaryChars, globalOptions.boundaryChars.length); } + if (fieldOptions.boundaryScannerLocale == null) { + fieldOptions.boundaryScannerLocale = globalOptions.boundaryScannerLocale; + } if (fieldOptions.highlighterType == null) { fieldOptions.highlighterType = globalOptions.highlighterType; } diff --git a/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java 
b/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java deleted file mode 100644 index fafe8a954c8..00000000000 --- a/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java +++ /dev/null @@ -1,1074 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.analysis.synonym; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CannedTokenStream; -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockGraphTokenFilter; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TokenStreamToTermAutomatonQuery; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.AutomatonTestUtil; -import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; -import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.Util; - -import java.io.IOException; -import java.io.StringReader; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -public class SynonymGraphFilterTests extends BaseTokenStreamTestCase { - - /** - * Set a side effect by {@link #getAnalyzer}. 
- */ - private SynonymGraphFilter synFilter; - - // LUCENE-6664 - public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, String[] types, int[] posIncrements, int[] - posLengths) throws IOException { - assertAnalyzesTo(a, input, output, null, null, types, posIncrements, posLengths); - } - - public void testBasicKeepOrigOneOutput() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "x", "a", "b"}, new int[]{0, 2, 2, 4}, new int[]{1, 5, 3, 5}, new String[]{"word", - "SYNONYM", "word", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 2, 1, 1}); - a.close(); - } - - public void testMixedKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "e f", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c e f g", new String[]{"c", "x", "a", "b", "c", "y", "g"}, new int[]{0, 2, 2, 4, 6, 8, 12}, new - int[]{1, 5, 3, 5, 7, 11, 13}, new String[]{"word", "SYNONYM", "word", "word", "word", "SYNONYM", "word"}, new - int[]{1, 1, 0, - 1, 1, 1, 1}, new int[]{1, 2, 1, 1, 1, 1, 1}); - a.close(); - } - - public void testNoParseAfterBuffer() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b a", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "b b b", new String[]{"b", "b", "b"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testOneInputMultipleOutputKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "a b", "y", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "a", "b", "c"}, new int[]{0, 2, 2, 2, 4, 6}, new int[]{1, 5, 5, 3, 5, - 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1}, new - int[]{1, 2, 2, - 1, 1, 1, 1, 1}); - a.close(); - } - - /** - * parse a syn file with bad syntax - */ - public void testInvalidAnalyzesToNothingOutput() throws Exception { - String testFile = "a => 1"; - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, false); - SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); - try { - parser.parse(new StringReader(testFile)); - fail("didn't get expected exception"); - } catch (ParseException expected) { - // expected exc - } - analyzer.close(); - } - - /** - * parse a syn file with bad syntax - */ - public void testInvalidDoubleMap() throws Exception { - String testFile = "a => b => c"; - Analyzer analyzer = new MockAnalyzer(random()); - SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); - try { - parser.parse(new StringReader(testFile)); - fail("didn't get expected exception"); - } catch (ParseException expected) { - // expected exc - } - analyzer.close(); - } - - public void testMoreThanOneLookAhead() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c d", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "a b c e", new String[]{"a", "b", "c", "e"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "word", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testLookaheadAfterParse() throws Exception { - 
SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b b", "x", true); - add(b, "b", "y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "b a b b", new String[]{"y", "b", "a", "x", "b", "b"}, new int[]{0, 0, 2, 4, 4, 6}, new int[]{1, 1, 3, 7, 5, - 7}, null, new int[]{1, 0, 1, 1, 0, 1}, new int[]{1, 1, 1, 2, 1, 1}, true); - } - - public void testLookaheadSecondParse() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b b b", "x", true); - add(b, "b", "y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "b b", new String[]{"y", "b", "y", "b"}, new int[]{0, 0, 2, 2}, new int[]{1, 1, 3, 3}, null, new int[]{1, 0, - 1, 0}, new int[]{1, 1, 1, 1}, true); - } - - public void testOneInputMultipleOutputNoKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", false); - add(b, "a b", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "c"}, new int[]{0, 2, 2, 6}, new int[]{1, 5, 5, 7}, new - String[]{"word", "SYNONYM", "SYNONYM", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testOneInputMultipleOutputMixedKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "a b", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "a", "b", "c"}, new int[]{0, 2, 2, 2, 4, 6}, new int[]{1, 5, 5, 3, 5, - 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1}, new - int[]{1, 2, 2, - 1, 1, 1, 1, 1}); - a.close(); - } - - public void testSynAtEnd() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c d e a b", new String[]{"c", "d", "e", "x", "a", "b"}, new int[]{0, 2, 4, 6, 6, 8}, new int[]{1, 3, 5, 9, - 7, 9}, new String[]{"word", "word", "word", "SYNONYM", "word", "word"}, new int[]{1, 1, 1, 1, 0, 1}, new int[]{1, 1, 1, - 2, 1, - 1}); - a.close(); - } - - public void testTwoSynsInARow() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a", "x", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a a b", new String[]{"c", "x", "x", "b"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "SYNONYM", "SYNONYM", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testBasicKeepOrigTwoOutputs() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", true); - add(b, "a b", "m n o", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b d", new String[]{"c", "x", "m", "a", "y", "n", "o", "b", "d"}, new int[]{0, 2, 2, 2, 2, 2, 2, 4, 6}, - new int[]{1, 5, 5, 3, 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "SYNONYM", - "SYNONYM", "SYNONYM", - "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1, 1}, new int[]{1, 1, 2, 4, 4, 1, 2, 1, 1}); - a.close(); - } - - public void testNoCaptureIfNoMatch() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "c d d", new String[]{"c", "d", "d"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - 
assertEquals(0, synFilter.getCaptureCount()); - a.close(); - } - - public void testBasicNotKeepOrigOneOutput() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "x"}, new int[]{0, 2}, new int[]{1, 5}, new String[]{"word", "SYNONYM"}, new - int[]{1, 1}, new int[]{1, 1}); - a.close(); - } - - public void testBasicNoKeepOrigTwoOutputs() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b d", new String[]{"c", "x", "m", "y", "n", "o", "d"}, new int[]{0, 2, 2, 2, 2, 2, 6}, new int[]{1, 5, - 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", - "word"}, new int[]{1, 1, 0, 1, 1, - 1, 1}, new int[]{1, 1, 2, 3, 1, 1, 1}); - a.close(); - } - - public void testIgnoreCase() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c A B D", new String[]{"c", "x", "m", "y", "n", "o", "D"}, new int[]{0, 2, 2, 2, 2, 2, 6}, new int[]{1, 5, - 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", - "word"}, new int[]{1, 1, 0, 1, 1, - 1, 1}, new int[]{1, 1, 2, 3, 1, 1, 1}); - a.close(); - } - - public void testDoNotIgnoreCase() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, false); - assertAnalyzesTo(a, "c A B D", new String[]{"c", "A", "B", "D"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "word", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testBufferedFinish1() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "a", "b"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testBufferedFinish2() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "m n o", false); - add(b, "d e", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a d", new String[]{"c", "a", "d"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testCanReuse() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - Analyzer a = getAnalyzer(b, true); - for (int i = 0; i < 10; i++) { - assertAnalyzesTo(a, "c a b", new String[]{"c", "x", "a", "b"}, new int[]{0, 2, 2, 4}, new int[]{1, 5, 3, 5}, new - String[]{"word", "SYNONYM", "word", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 2, 1, 1}); - } - a.close(); - } - - /** - * Multiple input tokens map to a single output token - */ - public void testManyToOne() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c", "z", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "a b c d", new String[]{"z", "a", "b", "c", "d"}, new int[]{0, 0, 2, 4, 6}, new int[]{5, 1, 3, 5, 7}, new - 
String[]{"SYNONYM", "word", "word", "word", "word"}, new int[]{1, 0, 1, 1, 1}, new int[]{3, 1, 1, 1, 1}); - a.close(); - } - - public void testBufferAfterMatch() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c d", "x", true); - add(b, "a b", "y", false); - - // The 'c' token has to be buffered because SynGraphFilter - // needs to know whether a b c d -> x matches: - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "f a b c e", new String[]{"f", "y", "c", "e"}, new int[]{0, 2, 6, 8}, new int[]{1, 5, 7, 9}, new - String[]{"word", "SYNONYM", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testZeroSyns() throws Exception { - Tokenizer tokenizer = new MockTokenizer(); - tokenizer.setReader(new StringReader("aa bb")); - try { - new SynonymGraphFilter(tokenizer, new SynonymMap.Builder(true).build(), true); - fail("did not hit expected exception"); - } catch (IllegalArgumentException iae) { - // expected - assertEquals("fst must be non-null", iae.getMessage()); - } - } - - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery1() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "wtf happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "what the fudge", "wtf", true); - - SynonymMap map = b.build(); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - - TokenStream in = new CannedTokenStream(0, 23, token("what", 1, 1, 0, 4), token("the", 1, 1, 5, 8), token("fudge", 1, 1, 9, 14), - token("happened", 1, 1, 15, 23)); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - in = new CannedTokenStream(0, 12, token("wtf", 1, 1, 0, 3), token("happened", 1, 1, 4, 12)); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - - /** - * If we expand synonyms at search time, the results are correct. 
- */ - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery2() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "say wtf happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "fudge", "chocolate", true); - add(b, "what the fudge", "wtf", true); - add(b, "what the", "wut", true); - add(b, "say", "say what", true); - - SynonymMap map = b.build(); - - TokenStream in = new CannedTokenStream(0, 26, token("say", 1, 1, 0, 3), token("what", 1, 1, 3, 7), token("the", 1, 1, 8, 11), - token("fudge", 1, 1, 12, 17), token("happened", 1, 1, 18, 26)); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery3() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "say what the fudge happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "wtf", "what the fudge", true); - - SynonymMap map = b.build(); - - TokenStream in = new CannedTokenStream(0, 15, token("say", 1, 1, 0, 3), token("wtf", 1, 1, 3, 6), token("happened", 1, 1, 7, 15)); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { - final Token t = new Token(term, startOffset, endOffset); - t.setPositionIncrement(posInc); - t.setPositionLength(posLength); - return t; - } - - private String randomNonEmptyString() { - while (true) { - String s = TestUtil.randomUnicodeString(random()).trim(); - //String s = TestUtil.randomSimpleString(random()).trim(); - if (s.length() != 0 && s.indexOf('\u0000') == -1) { - return s; - } - } - } - - // Adds MockGraphTokenFilter after SynFilter: - public void testRandomGraphAfter() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final SynonymMap map = b.build(); - final boolean ignoreCase = random().nextBoolean(); - - final Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents 
createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); - TokenStream syns = new SynonymGraphFilter(tokenizer, map, ignoreCase); - TokenStream graph = new MockGraphTokenFilter(random(), syns); - return new TokenStreamComponents(tokenizer, graph); - } - }; - - checkRandomData(random(), analyzer, 100); - analyzer.close(); - } - } - - public void testEmptyStringInput() throws IOException { - final int numIters = atLeast(10); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - - checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); - analyzer.close(); - } - } - - /** - * simple random test, doesn't verify correctness. - * does verify it doesnt throw exceptions, or that the stream doesn't misbehave - */ - public void testRandom2() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - checkRandomData(random(), analyzer, 100); - analyzer.close(); - } - } - - /** - * simple random test like testRandom2, but for larger docs - */ - public void testRandomHuge() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - //if (VERBOSE) { - //System.out.println("TEST: iter=" + i + " numEntries=" + numEntries); - //} - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - checkRandomData(random(), analyzer, 100, 1024); - analyzer.close(); - } - } - - public void testEmptyTerm() throws IOException { - final int numIters = atLeast(10); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - final Analyzer analyzer = getAnalyzer(b, ignoreCase); - - checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); - analyzer.close(); - } - } - - public void testBuilderDedup() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "a b", new String[]{"ab"}, new int[]{1}); - a.close(); - } - - public void testBuilderNoDedup() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(false); - final boolean keepOrig = false; - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); 
- Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "a b", new String[]{"ab", "ab", "ab"}, new int[]{1, 0, 0}); - a.close(); - } - - public void testRecursion1() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "zoo", "zoo", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "zoo zoo $ zoo", new String[]{"zoo", "zoo", "$", "zoo"}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testRecursion2() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "zoo", "zoo", keepOrig); - add(b, "zoo", "zoo zoo", keepOrig); - Analyzer a = getAnalyzer(b, true); - - // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo"); - assertAnalyzesTo(a, "zoo zoo $ zoo", new String[]{"zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo"}, new - int[]{1, 0, 1, 1, 0, 1, 1, 1, 0, 1}); - a.close(); - } - - public void testKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = true; - add(b, "a b", "ab", keepOrig); - add(b, "a c", "ac", keepOrig); - add(b, "a", "aa", keepOrig); - add(b, "b", "bb", keepOrig); - add(b, "z x c v", "zxcv", keepOrig); - add(b, "x c", "xc", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "$", new String[]{"$"}, new int[]{1}); - assertAnalyzesTo(a, "a", new String[]{"aa", "a"}, new int[]{1, 0}); - assertAnalyzesTo(a, "a", new String[]{"aa", "a"}, new int[]{1, 0}); - assertAnalyzesTo(a, "$ a", new String[]{"$", "aa", "a"}, new int[]{1, 1, 0}); - assertAnalyzesTo(a, "a $", new String[]{"aa", "a", "$"}, new int[]{1, 0, 1}); - assertAnalyzesTo(a, "$ a !", new String[]{"$", "aa", "a", "!"}, new int[]{1, 1, 0, 1}); - assertAnalyzesTo(a, "a a", new String[]{"aa", "a", "aa", "a"}, new int[]{1, 0, 1, 0}); - assertAnalyzesTo(a, "b", new String[]{"bb", "b"}, new int[]{1, 0}); - assertAnalyzesTo(a, "z x c v", new String[]{"zxcv", "z", "x", "c", "v"}, new int[]{1, 0, 1, 1, 1}); - assertAnalyzesTo(a, "z x c $", new String[]{"z", "xc", "x", "c", "$"}, new int[]{1, 1, 0, 1, 1}); - a.close(); - } - - private Analyzer getAnalyzer(SynonymMap.Builder b, final boolean ignoreCase) throws IOException { - final SynonymMap map = b.build(); - return new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - // Make a local variable so testRandomHuge doesn't share it across threads! 
- SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase); - SynonymGraphFilterTests.this.synFilter = synFilter; - return new TokenStreamComponents(tokenizer, synFilter); - } - }; - } - - private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) { - if (VERBOSE) { - //System.out.println(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig); - } - CharsRefBuilder inputCharsRef = new CharsRefBuilder(); - SynonymMap.Builder.join(input.split(" +"), inputCharsRef); - - CharsRefBuilder outputCharsRef = new CharsRefBuilder(); - SynonymMap.Builder.join(output.split(" +"), outputCharsRef); - - b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig); - } - - private char[] randomBinaryChars(int minLen, int maxLen, double bias, char base) { - int len = TestUtil.nextInt(random(), minLen, maxLen); - char[] chars = new char[len]; - for (int i = 0; i < len; i++) { - char ch; - if (random().nextDouble() < bias) { - ch = base; - } else { - ch = (char) (base + 1); - } - chars[i] = ch; - } - - return chars; - } - - private static String toTokenString(char[] chars) { - StringBuilder b = new StringBuilder(); - for (char c : chars) { - if (b.length() > 0) { - b.append(' '); - } - b.append(c); - } - return b.toString(); - } - - private static class OneSyn { - char[] in; - char[] out; - boolean keepOrig; - - @Override - public String toString() { - return toTokenString(in) + " --> " + toTokenString(out) + " (keepOrig=" + keepOrig + ")"; - } - } - - public void testRandomSyns() throws Exception { - int synCount = atLeast(10); - double bias = random().nextDouble(); - boolean dedup = random().nextBoolean(); - - SynonymMap.Builder b = new SynonymMap.Builder(dedup); - List syns = new ArrayList<>(); - // Makes random syns from random a / b tokens, mapping to random x / y tokens - //if (VERBOSE) { - // System.out.println("TEST: make " + synCount + " syns"); - // System.out.println(" bias for a over b=" + bias); - // System.out.println(" dedup=" + dedup); - // System.out.println(" sausage=" + sausage); - //} - - int maxSynLength = 0; - - for (int i = 0; i < synCount; i++) { - OneSyn syn = new OneSyn(); - syn.in = randomBinaryChars(1, 5, bias, 'a'); - syn.out = randomBinaryChars(1, 5, 0.5, 'x'); - syn.keepOrig = random().nextBoolean(); - syns.add(syn); - - maxSynLength = Math.max(maxSynLength, syn.in.length); - - //if (VERBOSE) { - // System.out.println(" " + syn); - //} - add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig); - } - - // Only used w/ VERBOSE: - Analyzer aNoSausageed; - if (VERBOSE) { - aNoSausageed = getAnalyzer(b, true); - } else { - aNoSausageed = null; - } - - Analyzer a = getAnalyzer(b, true); - int iters = atLeast(20); - for (int iter = 0; iter < iters; iter++) { - - String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a')); - //String doc = toTokenString(randomBinaryChars(10, 50, bias, 'a')); - - //if (VERBOSE) { - // System.out.println("TEST: iter=" + iter + " doc=" + doc); - //} - Automaton expected = slowSynFilter(doc, syns); - if (VERBOSE) { - //System.out.println(" expected:\n" + expected.toDot()); - } - Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc))); - //if (VERBOSE) { - // System.out.println(" actual:\n" + actual.toDot()); - //} - - assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength, synFilter - .getMaxLookaheadUsed() <= maxSynLength); - - checkAnalysisConsistency(random(), a, random().nextBoolean(), doc); - // We can 
easily have a non-deterministic automaton at this point, e.g. if - // more than one syn matched at given point, or if the syn mapped to an - // output token that also happens to be in the input: - try { - actual = Operations.determinize(actual, 50000); - } catch (TooComplexToDeterminizeException tctde) { - // Unfortunately the syns can easily create difficult-to-determinize graphs: - assertTrue(approxEquals(actual, expected)); - continue; - } - - try { - expected = Operations.determinize(expected, 50000); - } catch (TooComplexToDeterminizeException tctde) { - // Unfortunately the syns can easily create difficult-to-determinize graphs: - assertTrue(approxEquals(actual, expected)); - continue; - } - - assertTrue(approxEquals(actual, expected)); - assertTrue(Operations.sameLanguage(actual, expected)); - } - - a.close(); - } - - /** - * Only used when true equality is too costly to check! - */ - private boolean approxEquals(Automaton actual, Automaton expected) { - // Don't collapse these into one line else the thread stack won't say which direction failed!: - boolean b1 = approxSubsetOf(actual, expected); - boolean b2 = approxSubsetOf(expected, actual); - return b1 && b2; - } - - private boolean approxSubsetOf(Automaton a1, Automaton a2) { - AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(a1); - for (int i = 0; i < 2000; i++) { - int[] ints = ras.getRandomAcceptedString(random()); - IntsRef path = new IntsRef(ints, 0, ints.length); - if (accepts(a2, path) == false) { - throw new RuntimeException("a2 does not accept " + path); - } - } - - // Presumed true - return true; - } - - /** - * Like {@link Operations#run} except the incoming automaton is allowed to be non-deterministic. - */ - private static boolean accepts(Automaton a, IntsRef path) { - Set states = new HashSet<>(); - states.add(0); - Transition t = new Transition(); - for (int i = 0; i < path.length; i++) { - int digit = path.ints[path.offset + i]; - Set nextStates = new HashSet<>(); - for (int state : states) { - int count = a.initTransition(state, t); - for (int j = 0; j < count; j++) { - a.getNextTransition(t); - if (digit >= t.min && digit <= t.max) { - nextStates.add(t.dest); - } - } - } - states = nextStates; - if (states.isEmpty()) { - return false; - } - } - - for (int state : states) { - if (a.isAccept(state)) { - return true; - } - } - - return false; - } - - /** - * Stupid, slow brute-force, yet hopefully bug-free, synonym filter. 
- */ - private Automaton slowSynFilter(String doc, List syns) { - String[] tokens = doc.split(" +"); - //if (VERBOSE) { - // System.out.println(" doc has " + tokens.length + " tokens"); - //} - int i = 0; - Automaton.Builder a = new Automaton.Builder(); - int lastState = a.createState(); - while (i < tokens.length) { - // Consider all possible syn matches starting at this point: - assert tokens[i].length() == 1; - //if (VERBOSE) { - // System.out.println(" i=" + i); - //} - - List matches = new ArrayList<>(); - for (OneSyn syn : syns) { - if (i + syn.in.length <= tokens.length) { - boolean match = true; - for (int j = 0; j < syn.in.length; j++) { - if (tokens[i + j].charAt(0) != syn.in[j]) { - match = false; - break; - } - } - - if (match) { - if (matches.isEmpty() == false) { - if (syn.in.length < matches.get(0).in.length) { - // Greedy matching: we already found longer syns matching here - continue; - } else if (syn.in.length > matches.get(0).in.length) { - // Greedy matching: all previous matches were shorter, so we drop them - matches.clear(); - } else { - // Keep the current matches: we allow multiple synonyms matching the same input string - } - } - - matches.add(syn); - } - } - } - - int nextState = a.createState(); - - if (matches.isEmpty() == false) { - // We have match(es) starting at this token - //if (VERBOSE) { - // System.out.println(" matches @ i=" + i + ": " + matches); - //} - // We keepOrig if any of the matches said to: - boolean keepOrig = false; - for (OneSyn syn : matches) { - keepOrig |= syn.keepOrig; - } - - if (keepOrig) { - // Add path for the original tokens - addSidePath(a, lastState, nextState, matches.get(0).in); - } - - for (OneSyn syn : matches) { - addSidePath(a, lastState, nextState, syn.out); - } - - i += matches.get(0).in.length; - } else { - a.addTransition(lastState, nextState, tokens[i].charAt(0)); - i++; - } - - lastState = nextState; - } - - a.setAccept(lastState, true); - - return topoSort(a.finish()); - } - - /** - * Just creates a side path from startState to endState with the provided tokens. 
- */ - private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens) { - int lastState = startState; - for (int i = 0; i < tokens.length; i++) { - int nextState; - if (i == tokens.length - 1) { - nextState = endState; - } else { - nextState = a.createState(); - } - - a.addTransition(lastState, nextState, tokens[i]); - - lastState = nextState; - } - } - - private Automaton toAutomaton(TokenStream ts) throws IOException { - PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); - PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - Automaton a = new Automaton(); - int srcNode = -1; - int destNode = -1; - int state = a.createState(); - while (ts.incrementToken()) { - assert termAtt.length() == 1; - char c = termAtt.charAt(0); - int posInc = posIncAtt.getPositionIncrement(); - if (posInc != 0) { - srcNode += posInc; - while (state < srcNode) { - state = a.createState(); - } - } - destNode = srcNode + posLenAtt.getPositionLength(); - while (state < destNode) { - state = a.createState(); - } - a.addTransition(srcNode, destNode, c); - } - ts.end(); - ts.close(); - a.finishState(); - a.setAccept(destNode, true); - return a; - } - - /** - * Renumbers nodes according to their topo sort - */ - private Automaton topoSort(Automaton in) { - int[] newToOld = Operations.topoSortStates(in); - int[] oldToNew = new int[newToOld.length]; - - Automaton.Builder a = new Automaton.Builder(); - //System.out.println("remap:"); - for (int i = 0; i < newToOld.length; i++) { - a.createState(); - oldToNew[newToOld[i]] = i; - //System.out.println(" " + newToOld[i] + " -> " + i); - if (in.isAccept(newToOld[i])) { - a.setAccept(i, true); - //System.out.println(" **"); - } - } - - Transition t = new Transition(); - for (int i = 0; i < newToOld.length; i++) { - int count = in.initTransition(newToOld[i], t); - for (int j = 0; j < count; j++) { - in.getNextTransition(t); - a.addTransition(i, oldToNew[t.dest], t.min, t.max); - } - } - - return a.finish(); - } - - /** - * Helper method to validate all strings that can be generated from a token stream. Uses {@link - * TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton - * are all and only the given valid strings. - * - * @param analyzer analyzer containing the SynonymFilter under test. - * @param text text to be analyzed. - * @param expectedStrings all expected finite strings. - */ - public void assertAllStrings(Analyzer analyzer, String text, String[] expectedStrings) throws IOException { - TokenStream tokenStream = analyzer.tokenStream("dummy", text); - try { - Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream); - Set finiteStrings = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1); - - assertEquals("Invalid resulting strings count. 
Expected " + expectedStrings.length + " was " + finiteStrings.size(), - expectedStrings.length, finiteStrings.size()); - - Set expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings)); - - BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder(); - for (IntsRef ir : finiteStrings) { - String s = Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '); - assertTrue("Unexpected string found: " + s, expectedStringsSet.contains(s)); - } - } finally { - tokenStream.close(); - } - } -} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java new file mode 100644 index 00000000000..713e9424759 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -0,0 +1,146 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; +import java.io.StringReader; + +/** + * Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} + */ +public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase { + final String type; + + public BaseWordDelimiterTokenFilterFactoryTestCase(String type) { + this.type = type; + } + + public void testDefault() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", + "fi", "4000", "j", "2", "se", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateWords() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + 
.put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateNumbers() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") + .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", + "se", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateAll() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") + .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") + .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testSplitOnCaseChange() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + String[] expected = new String[]{"PowerShot"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testPreserveOriginal() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + 
.put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", + "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testStemEnglishPossessive() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", + "se", "O", "Neil", "s"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java new file mode 100644 index 00000000000..2ae4267104a --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java @@ -0,0 +1,75 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.index.analysis; + + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.io.StringReader; + +public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { + public WordDelimiterGraphTokenFilterFactoryTests() { + super("word_delimiter_graph"); + } + + public void testMultiTerms() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") + .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") + .build()); + + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42", + "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se", + "ONeil", "O'Neil's", "O", "Neil" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1}; + int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1}; + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, + expectedIncr, expectedPosLen, null); + } + + /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + public void testPartsAndCatenate() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + int[] expectedIncr = new int[]{1, 0, 1}; + int[] expectedPosLen = new int[]{2, 1, 1}; + String[] expected = new String[]{"PowerShot", "Power", "Shot" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, + expectedIncr, expectedPosLen, null); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java index 1a7903bcfac..1e919e00bbb 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java @@ -24,121 +24,23 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.common.settings.Settings; import 
org.elasticsearch.env.Environment; import org.elasticsearch.test.ESTestCase; -import org.elasticsearch.test.ESTokenStreamTestCase; import java.io.IOException; import java.io.StringReader; -public class WordDelimiterTokenFilterFactoryTests extends ESTokenStreamTestCase { - public void testDefault() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateWords() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateNumbers() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") - .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateAll() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") - .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") - .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 
500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testSplitOnCaseChange() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot"; - String[] expected = new String[]{"PowerShot"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testPreserveOriginal() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testStemEnglishPossessive() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); +public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { + public WordDelimiterTokenFilterFactoryTests() { + super("word_delimiter"); } /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ public void testPartsAndCatenate() throws IOException { ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - 
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") - .build()); + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot"; String[] expected = new String[]{"Power", "PowerShot", "Shot" }; diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java index 944427b7e17..e33b201bf22 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java @@ -47,6 +47,7 @@ import org.elasticsearch.index.query.QueryParseContext; import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.search.SearchModule; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order; import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions; @@ -288,6 +289,7 @@ public class HighlightBuilderTests extends ESTestCase { mergeBeforeChek(highlightBuilder, fieldBuilder, fieldOptions); checkSame.accept(AbstractHighlighterBuilder::boundaryChars, FieldOptions::boundaryChars); + checkSame.accept(AbstractHighlighterBuilder::boundaryScannerType, FieldOptions::boundaryScannerType); checkSame.accept(AbstractHighlighterBuilder::boundaryMaxScan, FieldOptions::boundaryMaxScan); checkSame.accept(AbstractHighlighterBuilder::fragmentSize, FieldOptions::fragmentCharSize); checkSame.accept(AbstractHighlighterBuilder::fragmenter, FieldOptions::fragmenter); @@ -557,12 +559,23 @@ public class HighlightBuilderTests extends ESTestCase { if (randomBoolean()) { highlightBuilder.forceSource(randomBoolean()); } + if (randomBoolean()) { + if (randomBoolean()) { + highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values())); + } else { + // also test the string setter + highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values()).toString()); + } + } if (randomBoolean()) { highlightBuilder.boundaryMaxScan(randomIntBetween(0, 10)); } if (randomBoolean()) { highlightBuilder.boundaryChars(randomAsciiOfLengthBetween(1, 10).toCharArray()); } + if (randomBoolean()) { + highlightBuilder.boundaryScannerLocale(randomLocale(random()).toLanguageTag()); + } if (randomBoolean()) { highlightBuilder.noMatchSize(randomIntBetween(0, 10)); } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 815998ad093..7db99ff3232 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -44,6 +44,7 @@ import org.elasticsearch.plugins.Plugin; import org.elasticsearch.rest.RestStatus; import 
org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field; import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.test.ESIntegTestCase; @@ -57,6 +58,7 @@ import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import static org.elasticsearch.client.Requests.searchRequest; @@ -747,7 +749,94 @@ public class HighlighterSearchIT extends ESIntegTestCase { searchResponse = client().prepareSearch("test").setSource(source).get(); assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over")); + } + public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "A sentence with few words. Another sentence with even more words.")); + + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "sentence")) + .highlighter(highlight() + .field("field1", 20, 2) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE)); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + } + + public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "A sentence with few words. Another sentence with even more words.")); + + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "sentence")) + .highlighter(highlight() + .field("field1", 20, 2) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. 
")); + } + + public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); + + logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD)); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + } + + public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); + + logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); } /** diff --git a/docs/build.gradle b/docs/build.gradle index 36727b12e50..9fd593e2fae 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -81,6 +81,7 @@ buildRestTests.expectedUnconvertedCandidates = [ 'reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc', + 'reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc', 'reference/cat/snapshots.asciidoc', 'reference/cat/templates.asciidoc', 'reference/cat/thread_pool.asciidoc', diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc new file mode 100644 index 00000000000..01176fa5636 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc @@ -0,0 +1,97 @@ +[[analysis-word-delimiter-graph-tokenfilter]] +=== Word Delimiter Graph Token Filter + +experimental[] + +Named `word_delimiter_graph`, it splits words into subwords and performs +optional transformations on subword groups. Words are split into +subwords with the following rules: + +* split on intra-word delimiters (by default, all non alpha-numeric +characters). 
+* "Wi-Fi" -> "Wi", "Fi" +* split on case transitions: "PowerShot" -> "Power", "Shot" +* split on letter-number transitions: "SD500" -> "SD", "500" +* leading and trailing intra-word delimiters on each subword are +ignored: "//hello---there, 'dude'" -> "hello", "there", "dude" +* trailing "'s" are removed for each subword: "O'Neil's" -> "O", "Neil" + +Unlike the `word_delimiter`, this token filter correctly handles positions for +multi terms expansion at search-time when any of the following options +are set to true: + + * `preserve_original` + * `catenate_numbers` + * `catenate_words` + * `catenate_all` + +Parameters include: + +`generate_word_parts`:: + If `true` causes parts of words to be + generated: "PowerShot" => "Power" "Shot". Defaults to `true`. + +`generate_number_parts`:: + If `true` causes number subwords to be + generated: "500-42" => "500" "42". Defaults to `true`. + +`catenate_words`:: + If `true` causes maximum runs of word parts to be + catenated: "wi-fi" => "wifi". Defaults to `false`. + +`catenate_numbers`:: + If `true` causes maximum runs of number parts to + be catenated: "500-42" => "50042". Defaults to `false`. + +`catenate_all`:: + If `true` causes all subword parts to be catenated: + "wi-fi-4000" => "wifi4000". Defaults to `false`. + +`split_on_case_change`:: + If `true` causes "PowerShot" to be two tokens; + ("Power-Shot" remains two parts regards). Defaults to `true`. + +`preserve_original`:: + If `true` includes original words in subwords: + "500-42" => "500-42" "500" "42". Defaults to `false`. + +`split_on_numerics`:: + If `true` causes "j2se" to be three tokens; "j" + "2" "se". Defaults to `true`. + +`stem_english_possessive`:: + If `true` causes trailing "'s" to be + removed for each subword: "O'Neil's" => "O", "Neil". Defaults to `true`. + +Advance settings include: + +`protected_words`:: + A list of protected words from being delimiter. + Either an array, or also can set `protected_words_path` which resolved + to a file configured with protected words (one on each line). + Automatically resolves to `config/` based location if exists. + +`type_table`:: + A custom type mapping table, for example (when configured + using `type_table_path`): + +[source,js] +-------------------------------------------------- + # Map the $, %, '.', and ',' characters to DIGIT + # This might be useful for financial data. + $ => DIGIT + % => DIGIT + . => DIGIT + \\u002C => DIGIT + + # in some cases you might not want to split on ZWJ + # this also tests the case where we need a bigger byte[] + # see http://en.wikipedia.org/wiki/Zero-width_joiner + \\u200D => ALPHANUM +-------------------------------------------------- + +NOTE: Using a tokenizer like the `standard` tokenizer may interfere with +the `catenate_*` and `preserve_original` parameters, as the original +string may already have lost punctuation during tokenization. Instead, +you may want to use the `whitespace` tokenizer. + diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc index 30c0e20d5bf..81f454bb158 100644 --- a/docs/reference/search/request/highlighting.asciidoc +++ b/docs/reference/search/request/highlighting.asciidoc @@ -103,8 +103,7 @@ If `term_vector` information is provided by setting `term_vector` to will be used instead of the plain highlighter. 
The fast vector highlighter: * Is faster especially for large fields (> `1MB`) -* Can be customized with `boundary_chars`, `boundary_max_scan`, and - `fragment_offset` (see <>) +* Can be customized with `boundary_scanner` (see <>) * Requires setting `term_vector` to `with_positions_offsets` which increases the size of the index * Can combine matches from multiple fields into one result. See @@ -502,17 +501,23 @@ GET /_search -------------------------------------------------- // CONSOLE -[[boundary-characters]] -==== Boundary Characters +[[boundary-scanners]] +==== Boundary Scanners -When highlighting a field using the fast vector highlighter, -`boundary_chars` can be configured to define what constitutes a boundary -for highlighting. It's a single string with each boundary character -defined in it. It defaults to `.,!? \t\n`. +When highlighting a field using the fast vector highlighter, you can specify +how to break the highlighted fragments using `boundary_scanner`, which accepts +the following values: -The `boundary_max_scan` allows to control how far to look for boundary -characters, and defaults to `20`. +* `chars` (default): allows to configure which characters (`boundary_chars`) +constitute a boundary for highlighting. It's a single string with each boundary +character defined in it (defaults to `.,!? \t\n`). It also allows configuring +the `boundary_max_scan` to control how far to look for boundary characters +(defaults to `20`). +* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator] +to break the highlighted fragments at the next _word_ or _sentence_ boundary. +You can further specify `boundary_scanner_locale` to control which Locale is used +to search the text for these boundaries. [[matched-fields]] ==== Matched Fields diff --git a/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/DatabaseReaderLazyLoader.java b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/DatabaseReaderLazyLoader.java new file mode 100644 index 00000000000..f73d2ca13c1 --- /dev/null +++ b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/DatabaseReaderLazyLoader.java @@ -0,0 +1,62 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.ingest.geoip; + +import com.maxmind.geoip2.DatabaseReader; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.SetOnce; +import org.elasticsearch.common.CheckedSupplier; +import org.elasticsearch.common.logging.Loggers; + +import java.io.Closeable; +import java.io.IOException; + +/** + * Facilitates lazy loading of the database reader, so that when the geoip plugin is installed, but not used, + * no memory is being wasted on the database reader. + */ +final class DatabaseReaderLazyLoader implements Closeable { + + private static final Logger LOGGER = Loggers.getLogger(DatabaseReaderLazyLoader.class); + + private final String databaseFileName; + private final CheckedSupplier loader; + // package protected for testing only: + final SetOnce databaseReader; + + DatabaseReaderLazyLoader(String databaseFileName, CheckedSupplier loader) { + this.databaseFileName = databaseFileName; + this.loader = loader; + this.databaseReader = new SetOnce<>(); + } + + synchronized DatabaseReader get() throws IOException { + if (databaseReader.get() == null) { + databaseReader.set(loader.get()); + LOGGER.debug("Loaded [{}] geoip database", databaseFileName); + } + return databaseReader.get(); + } + + @Override + public synchronized void close() throws IOException { + IOUtils.close(databaseReader.get()); + } +} diff --git a/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/GeoIpProcessor.java b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/GeoIpProcessor.java index 3d1418dc940..2cbaa7a3bb1 100644 --- a/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/GeoIpProcessor.java +++ b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/GeoIpProcessor.java @@ -19,19 +19,6 @@ package org.elasticsearch.ingest.geoip; -import java.io.IOException; -import java.net.InetAddress; -import java.security.AccessController; -import java.security.PrivilegedAction; -import java.util.Arrays; -import java.util.Collections; -import java.util.EnumSet; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; - import com.maxmind.geoip2.DatabaseReader; import com.maxmind.geoip2.exception.AddressNotFoundException; import com.maxmind.geoip2.model.CityResponse; @@ -49,6 +36,19 @@ import org.elasticsearch.ingest.AbstractProcessor; import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; +import java.io.IOException; +import java.net.InetAddress; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + import static org.elasticsearch.ingest.ConfigurationUtils.newConfigurationException; import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty; import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalList; @@ -264,9 +264,9 @@ public final class GeoIpProcessor extends AbstractProcessor { ); static final Set DEFAULT_COUNTRY_PROPERTIES = EnumSet.of(Property.CONTINENT_NAME, Property.COUNTRY_ISO_CODE); - private final Map databaseReaders; + private final Map databaseReaders; - public Factory(Map databaseReaders) { + public Factory(Map databaseReaders) { this.databaseReaders = databaseReaders; } @@ -279,12 +279,13 @@ public final class 
GeoIpProcessor extends AbstractProcessor { List propertyNames = readOptionalList(TYPE, processorTag, config, "properties"); boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); - DatabaseReader databaseReader = databaseReaders.get(databaseFile); - if (databaseReader == null) { + DatabaseReaderLazyLoader lazyLoader = databaseReaders.get(databaseFile); + if (lazyLoader == null) { throw newConfigurationException(TYPE, processorTag, "database_file", "database file [" + databaseFile + "] doesn't exist"); } + DatabaseReader databaseReader = lazyLoader.get(); String databaseType = databaseReader.getMetadata().getDatabaseType(); final Set properties; diff --git a/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/IngestGeoIpPlugin.java b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/IngestGeoIpPlugin.java index 4e5cc5c0237..1571bc99ea4 100644 --- a/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/IngestGeoIpPlugin.java +++ b/plugins/ingest-geoip/src/main/java/org/elasticsearch/ingest/geoip/IngestGeoIpPlugin.java @@ -19,6 +19,15 @@ package org.elasticsearch.ingest.geoip; +import com.maxmind.db.NoCache; +import com.maxmind.db.NodeCache; +import com.maxmind.geoip2.DatabaseReader; +import org.apache.lucene.util.IOUtils; +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.ingest.Processor; +import org.elasticsearch.plugins.IngestPlugin; +import org.elasticsearch.plugins.Plugin; + import java.io.Closeable; import java.io.IOException; import java.io.InputStream; @@ -35,20 +44,11 @@ import java.util.Map; import java.util.stream.Stream; import java.util.zip.GZIPInputStream; -import com.maxmind.db.NoCache; -import com.maxmind.db.NodeCache; -import com.maxmind.geoip2.DatabaseReader; -import org.apache.lucene.util.IOUtils; -import org.elasticsearch.common.settings.Setting; -import org.elasticsearch.ingest.Processor; -import org.elasticsearch.plugins.IngestPlugin; -import org.elasticsearch.plugins.Plugin; - public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable { public static final Setting CACHE_SIZE = Setting.longSetting("ingest.geoip.cache_size", 1000, 0, Setting.Property.NodeScope); - private Map databaseReaders; + private Map databaseReaders; @Override public List> getSettings() { @@ -76,12 +76,12 @@ public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable return Collections.singletonMap(GeoIpProcessor.TYPE, new GeoIpProcessor.Factory(databaseReaders)); } - static Map loadDatabaseReaders(Path geoIpConfigDirectory, NodeCache cache) throws IOException { + static Map loadDatabaseReaders(Path geoIpConfigDirectory, NodeCache cache) throws IOException { if (Files.exists(geoIpConfigDirectory) == false && Files.isDirectory(geoIpConfigDirectory)) { throw new IllegalStateException("the geoip directory [" + geoIpConfigDirectory + "] containing databases doesn't exist"); } - Map databaseReaders = new HashMap<>(); + Map databaseReaders = new HashMap<>(); try (Stream databaseFiles = Files.list(geoIpConfigDirectory)) { PathMatcher pathMatcher = geoIpConfigDirectory.getFileSystem().getPathMatcher("glob:**.mmdb.gz"); // Use iterator instead of forEach otherwise IOException needs to be caught twice... 
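The next hunk replaces the eager `DatabaseReader.Builder(...).build()` call with the lazy loader introduced above. For readers skimming the patch, here is a self-contained sketch of the same pattern using only the JDK; the `LazyLoader` and nested `CheckedSupplier` names are stand-ins invented for this illustration, not the plugin's actual types (the real class is `DatabaseReaderLazyLoader`, built on Elasticsearch's `CheckedSupplier`).

[source,java]
--------------------------------------------------
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Minimal stand-in for the idea behind DatabaseReaderLazyLoader: registration is cheap,
 * the expensive resource is built once on first get(), then reused.
 */
final class LazyLoader<T> {

    interface CheckedSupplier<V> {
        V get() throws IOException;
    }

    private final CheckedSupplier<T> loader;
    private T value; // created lazily on first get()

    LazyLoader(CheckedSupplier<T> loader) {
        this.loader = loader;
    }

    synchronized T get() throws IOException {
        if (value == null) {
            value = loader.get(); // the expensive load happens here, exactly once
        }
        return value;
    }

    public static void main(String[] args) throws IOException {
        // Registering loaders costs nothing; no database is parsed yet.
        Map<String, LazyLoader<String>> readers = new HashMap<>();
        readers.put("GeoLite2-City.mmdb.gz",
                new LazyLoader<>(() -> "pretend this opened the City database"));
        readers.put("GeoLite2-Country.mmdb.gz",
                new LazyLoader<>(() -> "pretend this opened the Country database"));

        // Only the database a processor actually asks for is loaded.
        System.out.println(readers.get("GeoLite2-City.mmdb.gz").get());
    }
}
--------------------------------------------------

The plugin variant additionally implements `Closeable`; because the underlying reader is only created on the first `get()`, closing a loader that was never used has effectively nothing to release.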
@@ -89,10 +89,13 @@ public class IngestGeoIpPlugin extends Plugin implements IngestPlugin, Closeable while (iterator.hasNext()) { Path databasePath = iterator.next(); if (Files.isRegularFile(databasePath) && pathMatcher.matches(databasePath)) { - try (InputStream inputStream = new GZIPInputStream(Files.newInputStream(databasePath, StandardOpenOption.READ))) { - databaseReaders.put(databasePath.getFileName().toString(), - new DatabaseReader.Builder(inputStream).withCache(cache).build()); - } + String databaseFileName = databasePath.getFileName().toString(); + DatabaseReaderLazyLoader holder = new DatabaseReaderLazyLoader(databaseFileName, () -> { + try (InputStream inputStream = new GZIPInputStream(Files.newInputStream(databasePath, StandardOpenOption.READ))) { + return new DatabaseReader.Builder(inputStream).withCache(cache).build(); + } + }); + databaseReaders.put(databaseFileName, holder); } } } diff --git a/plugins/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpProcessorFactoryTests.java b/plugins/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpProcessorFactoryTests.java index 0c80bcc71fd..8db0d15f796 100644 --- a/plugins/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpProcessorFactoryTests.java +++ b/plugins/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpProcessorFactoryTests.java @@ -22,7 +22,6 @@ package org.elasticsearch.ingest.geoip; import com.carrotsearch.randomizedtesting.generators.RandomPicks; import com.maxmind.db.NoCache; import com.maxmind.db.NodeCache; -import com.maxmind.geoip2.DatabaseReader; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.Randomness; import org.elasticsearch.test.ESTestCase; @@ -48,7 +47,7 @@ import static org.hamcrest.Matchers.sameInstance; public class GeoIpProcessorFactoryTests extends ESTestCase { - private static Map databaseReaders; + private static Map databaseReaders; @BeforeClass public static void loadDatabaseReaders() throws IOException { @@ -66,7 +65,7 @@ public class GeoIpProcessorFactoryTests extends ESTestCase { @AfterClass public static void closeDatabaseReaders() throws IOException { - for (DatabaseReader reader : databaseReaders.values()) { + for (DatabaseReaderLazyLoader reader : databaseReaders.values()) { reader.close(); } databaseReaders = null; @@ -222,4 +221,37 @@ public class GeoIpProcessorFactoryTests extends ESTestCase { assertThat(e.getMessage(), equalTo("[properties] property isn't a list, but of type [java.lang.String]")); } } + + public void testLazyLoading() throws Exception { + Path configDir = createTempDir(); + Path geoIpConfigDir = configDir.resolve("ingest-geoip"); + Files.createDirectories(geoIpConfigDir); + Files.copy(new ByteArrayInputStream(StreamsUtils.copyToBytesFromClasspath("/GeoLite2-City.mmdb.gz")), + geoIpConfigDir.resolve("GeoLite2-City.mmdb.gz")); + Files.copy(new ByteArrayInputStream(StreamsUtils.copyToBytesFromClasspath("/GeoLite2-Country.mmdb.gz")), + geoIpConfigDir.resolve("GeoLite2-Country.mmdb.gz")); + + // Loading another database reader instances, because otherwise we can't test lazy loading as the the + // database readers used at class level are reused between tests. 
(we want to keep that otherwise running this + // test will take roughly 4 times more time) + Map databaseReaders = + IngestGeoIpPlugin.loadDatabaseReaders(geoIpConfigDir, NoCache.getInstance()); + GeoIpProcessor.Factory factory = new GeoIpProcessor.Factory(databaseReaders); + for (DatabaseReaderLazyLoader lazyLoader : databaseReaders.values()) { + assertNull(lazyLoader.databaseReader.get()); + } + + Map config = new HashMap<>(); + config.put("field", "_field"); + config.put("database_file", "GeoLite2-City.mmdb.gz"); + factory.create(null, "_tag", config); + config = new HashMap<>(); + config.put("field", "_field"); + config.put("database_file", "GeoLite2-Country.mmdb.gz"); + factory.create(null, "_tag", config); + + for (DatabaseReaderLazyLoader lazyLoader : databaseReaders.values()) { + assertNotNull(lazyLoader.databaseReader.get()); + } + } } diff --git a/plugins/repository-s3/src/main/java/org/elasticsearch/cloud/aws/InternalAwsS3Service.java b/plugins/repository-s3/src/main/java/org/elasticsearch/cloud/aws/InternalAwsS3Service.java index ce47bd44f0b..cc5d69d61c7 100644 --- a/plugins/repository-s3/src/main/java/org/elasticsearch/cloud/aws/InternalAwsS3Service.java +++ b/plugins/repository-s3/src/main/java/org/elasticsearch/cloud/aws/InternalAwsS3Service.java @@ -150,18 +150,7 @@ public class InternalAwsS3Service extends AbstractLifecycleComponent implements if (key.length() == 0 && secret.length() == 0) { logger.debug("Using instance profile credentials"); - AWSCredentialsProvider credentials = new InstanceProfileCredentialsProvider(); - return new AWSCredentialsProvider() { - @Override - public AWSCredentials getCredentials() { - return SocketAccess.doPrivileged(credentials::getCredentials); - } - - @Override - public void refresh() { - SocketAccess.doPrivilegedVoid(credentials::refresh); - } - }; + return new PrivilegedInstanceProfileCredentialsProvider(); } else { logger.debug("Using basic key/secret credentials"); return new StaticCredentialsProvider(new BasicAWSCredentials(key.toString(), secret.toString())); @@ -221,4 +210,22 @@ public class InternalAwsS3Service extends AbstractLifecycleComponent implements // Ensure that IdleConnectionReaper is shutdown IdleConnectionReaper.shutdown(); } + + static class PrivilegedInstanceProfileCredentialsProvider implements AWSCredentialsProvider { + private final InstanceProfileCredentialsProvider credentials; + + private PrivilegedInstanceProfileCredentialsProvider() { + this.credentials = new InstanceProfileCredentialsProvider(); + } + + @Override + public AWSCredentials getCredentials() { + return SocketAccess.doPrivileged(credentials::getCredentials); + } + + @Override + public void refresh() { + SocketAccess.doPrivilegedVoid(credentials::refresh); + } + } } diff --git a/plugins/repository-s3/src/test/java/org/elasticsearch/cloud/aws/AwsS3ServiceImplTests.java b/plugins/repository-s3/src/test/java/org/elasticsearch/cloud/aws/AwsS3ServiceImplTests.java index 73252102c2f..09a3222d63e 100644 --- a/plugins/repository-s3/src/test/java/org/elasticsearch/cloud/aws/AwsS3ServiceImplTests.java +++ b/plugins/repository-s3/src/test/java/org/elasticsearch/cloud/aws/AwsS3ServiceImplTests.java @@ -37,7 +37,7 @@ public class AwsS3ServiceImplTests extends ESTestCase { public void testAWSCredentialsWithSystemProviders() { AWSCredentialsProvider credentialsProvider = InternalAwsS3Service.buildCredentials(logger, deprecationLogger, Settings.EMPTY, Settings.EMPTY, "default"); - assertThat(credentialsProvider, instanceOf(AWSCredentialsProvider.class)); + 
assertThat(credentialsProvider, instanceOf(InternalAwsS3Service.PrivilegedInstanceProfileCredentialsProvider.class)); } public void testAwsCredsDefaultSettings() { diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/10_unified.yaml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/10_unified.yaml index 72f782e68d1..644e8c4ec5a 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/10_unified.yaml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/10_unified.yaml @@ -28,7 +28,7 @@ setup: --- "Basic": - skip: - version: " - 5.2.99" + version: " - 5.99.99" reason: this uses a new highlighter that has been added in 5.3 - do: search:
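Closing note: the sentence and word boundary scanner tests earlier in this patch ultimately rely on how Java's `BreakIterator` segments text for a given `Locale`. The following standalone sketch (an illustration only, not part of the change) prints the segments that the `sentence` and `word` scanners would roughly work from for the sample text used in `HighlighterSearchIT`.

[source,java]
--------------------------------------------------
import java.text.BreakIterator;
import java.util.Locale;

public class BreakIteratorDemo {

    static void printSegments(BreakIterator it, String text) {
        it.setText(text);
        int start = it.first();
        for (int end = it.next(); end != BreakIterator.DONE; start = end, end = it.next()) {
            System.out.println("[" + text.substring(start, end) + "]");
        }
    }

    public static void main(String[] args) {
        String text = "A sentence with few words. Another sentence with even more words.";
        System.out.println("sentence boundaries:");
        printSegments(BreakIterator.getSentenceInstance(Locale.ENGLISH), text);
        System.out.println("word boundaries:");
        printSegments(BreakIterator.getWordInstance(Locale.ENGLISH), text);
    }
}
--------------------------------------------------

Note how the sentence instance keeps the trailing space with each sentence, which is why the expected highlight fragments in those tests end with a space.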