diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index ffe80a0f5f4..d49edb33eb3 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -278,22 +278,6 @@ public final class AnalysisModule { * version uses a set of English stop words that are in * lucene-analyzers-common so "stop" is defined in the analysis-common * module. */ - - // Add token filters declared in PreBuiltTokenFilters until they have all been migrated - for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) { - switch (preBuilt) { - case LOWERCASE: - // This has been migrated but has to stick around until PreBuiltTokenizers is removed. - continue; - default: - if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) { - throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy()); - } - String name = preBuilt.name().toLowerCase(Locale.ROOT); - preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(), - tokenStream -> preBuilt.create(tokenStream, Version.CURRENT))); - } - } for (AnalysisPlugin plugin: plugins) { for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) { diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index 427c0431fb5..ba66c41e639 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -20,38 +20,10 @@ package org.elasticsearch.indices.analysis; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; -import org.apache.lucene.analysis.ar.ArabicStemFilter; -import org.apache.lucene.analysis.br.BrazilianStemFilter; -import org.apache.lucene.analysis.cjk.CJKBigramFilter; -import org.apache.lucene.analysis.cjk.CJKWidthFilter; -import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter; -import org.apache.lucene.analysis.core.DecimalDigitFilter; -import org.apache.lucene.analysis.cz.CzechStemFilter; -import org.apache.lucene.analysis.de.GermanNormalizationFilter; -import org.apache.lucene.analysis.de.GermanStemFilter; -import org.apache.lucene.analysis.fa.PersianNormalizationFilter; -import org.apache.lucene.analysis.fr.FrenchAnalyzer; -import org.apache.lucene.analysis.hi.HindiNormalizationFilter; -import org.apache.lucene.analysis.in.IndicNormalizationFilter; -import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; -import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; -import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter; -import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; -import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; -import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; -import org.apache.lucene.analysis.shingle.ShingleFilter; -import org.apache.lucene.analysis.snowball.SnowballFilter; -import org.apache.lucene.analysis.tr.ApostropheFilter; -import org.apache.lucene.analysis.util.ElisionFilter; import org.elasticsearch.Version; -import 
org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; -import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory; import org.elasticsearch.index.analysis.MultiTermAwareComponent; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; -import org.tartarus.snowball.ext.DutchStemmer; -import org.tartarus.snowball.ext.FrenchStemmer; import java.util.Locale; @@ -66,229 +38,7 @@ public enum PreBuiltTokenFilters { protected boolean isMultiTermAware() { return true; } - }, - - // Extended Token Filters - ELISION(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - ARABIC_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ArabicStemFilter(tokenStream); - } - }, - - BRAZILIAN_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new BrazilianStemFilter(tokenStream); - } - }, - - CZECH_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new CzechStemFilter(tokenStream); - } - }, - - DUTCH_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new SnowballFilter(tokenStream, new DutchStemmer()); - } - }, - - FRENCH_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new SnowballFilter(tokenStream, new FrenchStemmer()); - } - }, - - GERMAN_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new GermanStemFilter(tokenStream); - } - }, - - RUSSIAN_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new SnowballFilter(tokenStream, "Russian"); - } - }, - - KEYWORD_REPEAT(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new KeywordRepeatFilter(tokenStream); - } - }, - - ARABIC_NORMALIZATION(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ArabicNormalizationFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - PERSIAN_NORMALIZATION(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new PersianNormalizationFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - TYPE_AS_PAYLOAD(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new TypeAsPayloadTokenFilter(tokenStream); - } - }, - - SHINGLE(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ShingleFilter(tokenStream); - } - }, - - GERMAN_NORMALIZATION(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new GermanNormalizationFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - HINDI_NORMALIZATION(CachingStrategy.ONE) { - 
@Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new HindiNormalizationFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - INDIC_NORMALIZATION(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new IndicNormalizationFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - SORANI_NORMALIZATION(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new SoraniNormalizationFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ScandinavianNormalizationFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - SCANDINAVIAN_FOLDING(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ScandinavianFoldingFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - APOSTROPHE(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ApostropheFilter(tokenStream); - } - }, - - CJK_WIDTH(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new CJKWidthFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - DECIMAL_DIGIT(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new DecimalDigitFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - CJK_BIGRAM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new CJKBigramFilter(tokenStream); - } - }, - - DELIMITED_PAYLOAD_FILTER(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new DelimitedPayloadTokenFilter(tokenStream, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER); - } - }, - - LIMIT(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new LimitTokenCountFilter(tokenStream, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS); - } - }, - - ; + }; protected boolean isMultiTermAware() { return false; diff --git a/core/src/test/java/org/elasticsearch/action/termvectors/AbstractTermVectorsTestCase.java b/core/src/test/java/org/elasticsearch/action/termvectors/AbstractTermVectorsTestCase.java index 05e30d7e2d7..15a2f9e74a4 100644 --- a/core/src/test/java/org/elasticsearch/action/termvectors/AbstractTermVectorsTestCase.java +++ b/core/src/test/java/org/elasticsearch/action/termvectors/AbstractTermVectorsTestCase.java @@ -66,7 +66,6 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke import static org.hamcrest.Matchers.equalTo; public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase { - protected static class TestFieldSetting { public final String name; public final boolean storedOffset; @@ -211,7 +210,7 
@@ public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase { Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer.tv_test.tokenizer", "standard") - .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"); + .putArray("index.analysis.analyzer.tv_test.filter", "lowercase"); assertAcked(prepareCreate(index).addMapping("type1", mappingBuilder).setSettings(settings).addAlias(new Alias(alias))); } @@ -395,11 +394,7 @@ public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase { assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1)); assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1)); } - if (field.storedPayloads && testConfig.requestPayloads) { - assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload())); - } else { - assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null)); - } + assertNull("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload()); } } assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next()); diff --git a/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsCheckDocFreqIT.java b/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsCheckDocFreqIT.java deleted file mode 100644 index 294a0ffde8f..00000000000 --- a/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsCheckDocFreqIT.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.elasticsearch.action.termvectors; - -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.xcontent.ToXContent; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.common.xcontent.XContentFactory; -import org.elasticsearch.test.ESIntegTestCase; -import org.hamcrest.Matchers; - -import java.io.IOException; - -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; -import static org.hamcrest.Matchers.equalTo; - -public class GetTermVectorsCheckDocFreqIT extends ESIntegTestCase { - - @Override - protected int numberOfShards() { - return 1; - } - - @Override - protected int numberOfReplicas() { - return 0; - } - - @Override - public Settings indexSettings() { - return Settings.builder() - .put(super.indexSettings()) - .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase") - .build(); - } - - public void testSimpleTermVectors() throws IOException { - XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1") - .startObject("properties") - .startObject("field") - .field("type", "text") - .field("term_vector", "with_positions_offsets_payloads") - .field("analyzer", "tv_test") - .endObject() - .endObject() - .endObject().endObject(); - assertAcked(prepareCreate("test").addMapping("type1", mapping)); - ensureGreen(); - int numDocs = 15; - for (int i = 0; i < numDocs; i++) { - client().prepareIndex("test", "type1", Integer.toString(i)) - .setSource(XContentFactory.jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog") - // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30 - // 31the34 35lazy39 40dog43 - .endObject()).execute().actionGet(); - refresh(); - } - String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" }; - int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 }; - int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } }; - int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } }; - int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } }; - for (int i = 0; i < numDocs; i++) { - checkAllInfo(numDocs, values, freq, pos, startOffset, endOffset, i); - checkWithoutTermStatistics(numDocs, values, freq, pos, startOffset, endOffset, i); - checkWithoutFieldStatistics(numDocs, values, freq, pos, startOffset, endOffset, i); - } - } - - private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, - int i) throws IOException { - TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true) - .setPositions(true).setTermStatistics(true).setFieldStatistics(false).setSelectedFields(); - TermVectorsResponse response = resp.execute().actionGet(); - assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); - Fields fields = response.getFields(); - assertThat(fields.size(), equalTo(1)); - Terms terms = fields.terms("field"); - assertThat(terms.size(), equalTo(8L)); - assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1)); - 
assertThat(terms.getDocCount(), Matchers.equalTo(-1)); - assertThat(terms.getSumDocFreq(), equalTo((long) -1)); - TermsEnum iterator = terms.iterator(); - for (int j = 0; j < values.length; j++) { - String string = values[j]; - BytesRef next = iterator.next(); - assertThat(next, Matchers.notNullValue()); - assertThat("expected " + string, string, equalTo(next.utf8ToString())); - assertThat(next, Matchers.notNullValue()); - if (string.equals("the")) { - assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq())); - } else { - assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq())); - } - - PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); - assertThat(docsAndPositions.nextDoc(), equalTo(0)); - assertThat(freq[j], equalTo(docsAndPositions.freq())); - assertThat(iterator.docFreq(), equalTo(numDocs)); - int[] termPos = pos[j]; - int[] termStartOffset = startOffset[j]; - int[] termEndOffset = endOffset[j]; - assertThat(termPos.length, equalTo(freq[j])); - assertThat(termStartOffset.length, equalTo(freq[j])); - assertThat(termEndOffset.length, equalTo(freq[j])); - for (int k = 0; k < freq[j]; k++) { - int nextPosition = docsAndPositions.nextPosition(); - assertThat("term: " + string, nextPosition, equalTo(termPos[k])); - assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); - assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); - assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); - } - } - assertThat(iterator.next(), Matchers.nullValue()); - - XContentBuilder xBuilder = XContentFactory.jsonBuilder(); - response.toXContent(xBuilder, null); - String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");; - String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" - + i - + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}"; - assertThat(utf8, equalTo(expectedString)); - - } - - private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, - int i) throws IOException { - TermVectorsRequestBuilder resp = 
client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true) - .setPositions(true).setTermStatistics(false).setFieldStatistics(true).setSelectedFields(); - assertThat(resp.request().termStatistics(), equalTo(false)); - TermVectorsResponse response = resp.execute().actionGet(); - assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); - Fields fields = response.getFields(); - assertThat(fields.size(), equalTo(1)); - Terms terms = fields.terms("field"); - assertThat(terms.size(), equalTo(8L)); - assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs))); - assertThat(terms.getDocCount(), Matchers.equalTo(numDocs)); - assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length)); - TermsEnum iterator = terms.iterator(); - for (int j = 0; j < values.length; j++) { - String string = values[j]; - BytesRef next = iterator.next(); - assertThat(next, Matchers.notNullValue()); - assertThat("expected " + string, string, equalTo(next.utf8ToString())); - assertThat(next, Matchers.notNullValue()); - - assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq())); - - PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); - assertThat(docsAndPositions.nextDoc(), equalTo(0)); - assertThat(freq[j], equalTo(docsAndPositions.freq())); - assertThat(iterator.docFreq(), equalTo(-1)); - int[] termPos = pos[j]; - int[] termStartOffset = startOffset[j]; - int[] termEndOffset = endOffset[j]; - assertThat(termPos.length, equalTo(freq[j])); - assertThat(termStartOffset.length, equalTo(freq[j])); - assertThat(termEndOffset.length, equalTo(freq[j])); - for (int k = 0; k < freq[j]; k++) { - int nextPosition = docsAndPositions.nextPosition(); - assertThat("term: " + string, nextPosition, equalTo(termPos[k])); - assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); - assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); - assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); - } - } - assertThat(iterator.next(), Matchers.nullValue()); - - XContentBuilder xBuilder = XContentFactory.jsonBuilder(); - response.toXContent(xBuilder, null); - String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");; - String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" - + i - + 
"\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}"; - assertThat(utf8, equalTo(expectedString)); - - } - - private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) - throws IOException { - TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true) - .setPositions(true).setFieldStatistics(true).setTermStatistics(true).setSelectedFields(); - assertThat(resp.request().fieldStatistics(), equalTo(true)); - TermVectorsResponse response = resp.execute().actionGet(); - assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); - Fields fields = response.getFields(); - assertThat(fields.size(), equalTo(1)); - Terms terms = fields.terms("field"); - assertThat(terms.size(), equalTo(8L)); - assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs))); - assertThat(terms.getDocCount(), Matchers.equalTo(numDocs)); - assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length)); - TermsEnum iterator = terms.iterator(); - for (int j = 0; j < values.length; j++) { - String string = values[j]; - BytesRef next = iterator.next(); - assertThat(next, Matchers.notNullValue()); - assertThat("expected " + string, string, equalTo(next.utf8ToString())); - assertThat(next, Matchers.notNullValue()); - if (string.equals("the")) { - assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq())); - } else { - assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq())); - } - - PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); - assertThat(docsAndPositions.nextDoc(), equalTo(0)); - assertThat(freq[j], equalTo(docsAndPositions.freq())); - assertThat(iterator.docFreq(), equalTo(numDocs)); - int[] termPos = pos[j]; - int[] termStartOffset = startOffset[j]; - int[] termEndOffset = endOffset[j]; - assertThat(termPos.length, equalTo(freq[j])); - assertThat(termStartOffset.length, equalTo(freq[j])); - assertThat(termEndOffset.length, equalTo(freq[j])); - for (int k = 0; k < freq[j]; k++) { - int nextPosition = docsAndPositions.nextPosition(); - assertThat("term: " + string, nextPosition, equalTo(termPos[k])); - assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); - 
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); - assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); - } - } - assertThat(iterator.next(), Matchers.nullValue()); - - XContentBuilder xBuilder = XContentFactory.jsonBuilder(); - response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS); - String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");; - String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" - + i - + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}"; - assertThat(utf8, equalTo(expectedString)); - } - -} diff --git a/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java b/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java index ba2f5de24ba..2ab6292f2d2 100644 --- a/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java +++ b/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java @@ -193,7 +193,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase { .setSettings(Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"))); + .putArray("index.analysis.analyzer.tv_test.filter", "lowercase"))); for (int i = 0; i < 10; i++) { client().prepareIndex("test", "type1", Integer.toString(i)) .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog") @@ -278,7 +278,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase { assertAcked(prepareCreate("test").addMapping("type1", mapping) .setSettings(Settings.builder() .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"))); + .putArray("index.analysis.analyzer.tv_test.filter", "lowercase"))); for (int i = 0; i < 10; i++) { client().prepareIndex("test", "type1", Integer.toString(i)) .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog") @@ -585,7 +585,7 @@ 
public class GetTermVectorsIT extends AbstractTermVectorsTestCase { .setSettings(Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"))); + .putArray("index.analysis.analyzer.tv_test.filter", "lowercase"))); ensureGreen(); @@ -645,9 +645,8 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase { assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); - if (withPayloads) { - assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); - } + // We never configure an analyzer with payloads for this test so this is never returned + assertNull("term: " + string, docsAndPositions.getPayload()); } } assertThat(iterator.next(), nullValue()); diff --git a/core/src/test/java/org/elasticsearch/index/termvectors/TermVectorsServiceTests.java b/core/src/test/java/org/elasticsearch/index/termvectors/TermVectorsServiceTests.java index c79a61a22b9..c047235ada4 100644 --- a/core/src/test/java/org/elasticsearch/index/termvectors/TermVectorsServiceTests.java +++ b/core/src/test/java/org/elasticsearch/index/termvectors/TermVectorsServiceTests.java @@ -19,6 +19,9 @@ package org.elasticsearch.index.termvectors; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.elasticsearch.action.bulk.BulkRequestBuilder; import org.elasticsearch.action.termvectors.TermVectorsRequest; import org.elasticsearch.action.termvectors.TermVectorsResponse; import org.elasticsearch.common.settings.Settings; @@ -28,6 +31,7 @@ import org.elasticsearch.index.shard.IndexShard; import org.elasticsearch.indices.IndicesService; import org.elasticsearch.test.ESSingleNodeTestCase; +import java.io.IOException; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.stream.Stream; @@ -71,4 +75,45 @@ public class TermVectorsServiceTests extends ESSingleNodeTestCase { assertThat(response, notNullValue()); assertThat(response.getTookInMillis(), equalTo(TimeUnit.NANOSECONDS.toMillis(longs.get(1) - longs.get(0)))); } + + public void testDocFreqs() throws IOException { + XContentBuilder mapping = jsonBuilder() + .startObject() + .startObject("doc") + .startObject("properties") + .startObject("text") + .field("type", "text") + .field("term_vector", "with_positions_offsets_payloads") + .endObject() + .endObject() + .endObject() + .endObject(); + Settings settings = Settings.builder() + .put("number_of_shards", 1) + .build(); + createIndex("test", settings, "doc", mapping); + ensureGreen(); + + int max = between(3, 10); + BulkRequestBuilder bulk = client().prepareBulk(); + for (int i = 0; i < max; i++) { + bulk.add(client().prepareIndex("test", "doc", Integer.toString(i)) + .setSource("text", "the quick brown fox jumped over the lazy dog")); + } + bulk.get(); + + TermVectorsRequest request = new TermVectorsRequest("test", "doc", "0").termStatistics(true); + + IndicesService indicesService = getInstanceFromNode(IndicesService.class); + IndexService test = indicesService.indexService(resolveIndex("test")); + IndexShard shard = test.getShardOrNull(0); + assertThat(shard, notNullValue()); + TermVectorsResponse response = TermVectorsService.getTermVectors(shard, request); + + Terms terms = response.getFields().terms("text"); + 
TermsEnum iterator = terms.iterator(); + while (iterator.next() != null) { + assertEquals(max, iterator.docFreq()); + } + } } diff --git a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java index 2572b7aeb0f..298c8938dd2 100644 --- a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java +++ b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java @@ -23,11 +23,8 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.fa.PersianNormalizationFilter; import org.apache.lucene.analysis.hunspell.Dictionary; -import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.store.Directory; @@ -127,12 +124,6 @@ public class AnalysisModuleTests extends ESTestCase { testSimpleConfiguration(settings); } - public void testDefaultFactoryTokenFilters() throws IOException { - assertTokenFilter("keyword_repeat", KeywordRepeatFilter.class); - assertTokenFilter("persian_normalization", PersianNormalizationFilter.class); - assertTokenFilter("arabic_normalization", ArabicNormalizationFilter.class); - } - public void testAnalyzerAliasNotAllowedPost5x() throws IOException { Settings settings = Settings.builder() .put("index.analysis.analyzer.foobar.type", "standard") diff --git a/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java b/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java index d920c6a67b4..316277973ff 100644 --- a/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java +++ b/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java @@ -1550,30 +1550,6 @@ public class SearchQueryIT extends ESIntegTestCase { assertHitCount(searchResponse, 2); } - public void testMatchQueryWithStackedStems() throws IOException { - CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder() - .put(indexSettings()) - .put("index.analysis.analyzer.index.type", "custom") - .put("index.analysis.analyzer.index.tokenizer", "standard") - .put("index.analysis.analyzer.index.filter", "lowercase") - .put("index.analysis.analyzer.search.type", "custom") - .put("index.analysis.analyzer.search.tokenizer", "standard") - .putArray("index.analysis.analyzer.search.filter", "lowercase", "keyword_repeat", "porter_stem", "unique_stem") - .put("index.analysis.filter.unique_stem.type", "unique") - .put("index.analysis.filter.unique_stem.only_on_same_position", true)); - assertAcked(builder.addMapping("test", "text", "type=text,analyzer=index,search_analyzer=search")); - - client().prepareIndex("test", "test", "1").setSource("text", "the fox runs across the street").get(); - refresh(); - SearchResponse searchResponse = client().prepareSearch("test").setQuery(matchQuery("text", "fox runs").operator(Operator.AND)).get(); - assertHitCount(searchResponse, 1); - - client().prepareIndex("test", "test", "2").setSource("text", "run fox run").get(); - refresh(); - searchResponse = client().prepareSearch("test").setQuery(matchQuery("text", "fox 
runs").operator(Operator.AND)).get(); - assertHitCount(searchResponse, 2); - } - public void testQueryStringWithSynonyms() throws IOException { CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder() .put(indexSettings()) diff --git a/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java b/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java index 864f060be03..46a94e641c5 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java @@ -694,107 +694,6 @@ public class SuggestSearchIT extends ESIntegTestCase { assertSuggestion(searchSuggest, 0, "simple_phrase", "xorr the god jewel"); } - public void testPhraseBoundaryCases() throws IOException, URISyntaxException { - CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder() - .put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard - .put("index.analysis.analyzer.body.tokenizer", "standard") - .putArray("index.analysis.analyzer.body.filter", "lowercase") - .put("index.analysis.analyzer.bigram.tokenizer", "standard") - .putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase") - .put("index.analysis.analyzer.ngram.tokenizer", "standard") - .putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase") - .put("index.analysis.analyzer.myDefAnalyzer.tokenizer", "standard") - .putArray("index.analysis.analyzer.myDefAnalyzer.filter", "shingle", "lowercase") - .put("index.analysis.filter.my_shingle.type", "shingle") - .put("index.analysis.filter.my_shingle.output_unigrams", false) - .put("index.analysis.filter.my_shingle.min_shingle_size", 2) - .put("index.analysis.filter.my_shingle.max_shingle_size", 2) - .put("index.analysis.filter.my_shingle2.type", "shingle") - .put("index.analysis.filter.my_shingle2.output_unigrams", true) - .put("index.analysis.filter.my_shingle2.min_shingle_size", 2) - .put("index.analysis.filter.my_shingle2.max_shingle_size", 2)); - - XContentBuilder mapping = XContentFactory.jsonBuilder() - .startObject().startObject("type1") - .startObject("properties") - .startObject("body").field("type", "text").field("analyzer", "body").endObject() - .startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject() - .startObject("ngram").field("type", "text").field("analyzer", "ngram").endObject() - .endObject() - .endObject().endObject(); - assertAcked(builder.addMapping("type1", mapping)); - ensureGreen(); - - String[] strings = new String[]{ - "Xorr the God-Jewel", - "Grog the God-Crusher", - "Xorn", - "Walter Newell", - "Wanda Maximoff", - "Captain America", - "American Ace", - "Wundarr the Aquarian", - "Will o' the Wisp", - "Xemnu the Titan" - }; - for (String line : strings) { - index("test", "type1", line, "body", line, "bigram", line, "ngram", line); - } - refresh(); - - NumShards numShards = getNumShards("test"); - - // Lets make sure some things throw exceptions - PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram") - .analyzer("body") - .addCandidateGenerator(candidateGenerator("does_not_exist").minWordLength(1).suggestMode("always")) - .realWordErrorLikelihood(0.95f) - .maxErrors(0.5f) - .size(1); - phraseSuggestion.clearCandidateGenerators().analyzer(null); - try { - searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion)); 
- fail("analyzer does only produce ngrams"); - } catch (SearchPhaseExecutionException e) { - } - - phraseSuggestion.analyzer("bigram"); - try { - searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion)); - fail("analyzer does only produce ngrams"); - } catch (SearchPhaseExecutionException e) { - } - - // Now we'll make sure some things don't - phraseSuggestion.forceUnigrams(false); - searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion)); - - // Field doesn't produce unigrams but the analyzer does - phraseSuggestion.forceUnigrams(true).analyzer("ngram"); - searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion)); - - phraseSuggestion = phraseSuggestion("ngram") - .analyzer("myDefAnalyzer") - .forceUnigrams(true) - .realWordErrorLikelihood(0.95f) - .maxErrors(0.5f) - .size(1) - .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always")); - Suggest suggest = searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion)); - - // "xorr the god jewel" and and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by - // earlier term (xorn): - assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel"); - - phraseSuggestion.analyzer(null); - suggest = searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion)); - - // In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the - // probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the - // others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood - assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel"); - } - public void testDifferentShardSize() throws Exception { createIndex("test"); ensureGreen(); diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 1261d15ed65..290b09edc1d 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -21,13 +21,31 @@ package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; +import org.apache.lucene.analysis.ar.ArabicStemFilter; +import org.apache.lucene.analysis.br.BrazilianStemFilter; +import org.apache.lucene.analysis.cjk.CJKBigramFilter; +import org.apache.lucene.analysis.cjk.CJKWidthFilter; +import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.core.DecimalDigitFilter; import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.UpperCaseFilter; +import org.apache.lucene.analysis.cz.CzechStemFilter; +import org.apache.lucene.analysis.de.GermanNormalizationFilter; +import org.apache.lucene.analysis.de.GermanStemFilter; import org.apache.lucene.analysis.en.KStemFilter; import 
org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.fa.PersianNormalizationFilter; +import org.apache.lucene.analysis.fr.FrenchAnalyzer; +import org.apache.lucene.analysis.hi.HindiNormalizationFilter; +import org.apache.lucene.analysis.in.IndicNormalizationFilter; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; import org.apache.lucene.analysis.miscellaneous.LengthFilter; +import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; +import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; @@ -35,16 +53,25 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; +import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.ClassicFilter; +import org.apache.lucene.analysis.tr.ApostropheFilter; +import org.apache.lucene.analysis.util.ElisionFilter; import org.elasticsearch.index.analysis.CharFilterFactory; +import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; +import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; +import org.tartarus.snowball.ext.DutchStemmer; +import org.tartarus.snowball.ext.FrenchStemmer; import java.util.ArrayList; import java.util.List; @@ -74,29 +101,61 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { @Override public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() { List<PreConfiguredTokenFilter> filters = new ArrayList<>(); - filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, input -> new ASCIIFoldingFilter(input))); + filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("common_grams",
false, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET))); + filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input -> + new DelimitedPayloadTokenFilter(input, + DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, + DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER))); + filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer()))); filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); // TODO deprecate edgeNGram filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); + filters.add(PreConfiguredTokenFilter.singleton("elision", true, + input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES))); + filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer()))); + filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("length", false, input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless + filters.add(PreConfiguredTokenFilter.singleton("limit", false, input -> + new LimitTokenCountFilter(input, + LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, + LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS))); filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new)); // TODO deprecate nGram filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("reverse", false, input -> new ReverseStringFilter(input))); + filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian"))); + filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("shingle", false, ShingleFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English"))); + filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new)); 
filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new)); // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10))); + filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("unique", false, input -> new UniqueTokenFilter(input))); filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input -> diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index cf78f6646a2..d2505406457 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -20,6 +20,8 @@ package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.en.PorterStemFilterFactory; +import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory; +import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; @@ -68,22 +70,46 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase { @Override protected Map> getPreConfiguredTokenFilters() { Map> filters = new TreeMap<>(super.getPreConfiguredTokenFilters()); + filters.put("apostrophe", null); + filters.put("arabic_normalization", null); + filters.put("arabic_stem", null); filters.put("asciifolding", null); + filters.put("brazilian_stem", null); + filters.put("cjk_bigram", null); + filters.put("cjk_width", null); filters.put("classic", null); filters.put("common_grams", null); + filters.put("czech_stem", null); + filters.put("decimal_digit", null); + filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class); + filters.put("dutch_stem", SnowballPorterFilterFactory.class); filters.put("edge_ngram", null); filters.put("edgeNGram", null); + filters.put("elision", null); + filters.put("french_stem", SnowballPorterFilterFactory.class); + filters.put("german_stem", null); + filters.put("hindi_normalization", null); + filters.put("indic_normalization", null); + filters.put("keyword_repeat", null); filters.put("kstem", null); filters.put("length", null); + filters.put("limit", LimitTokenCountFilterFactory.class); filters.put("ngram", null); filters.put("nGram", null); + filters.put("persian_normalization", null); filters.put("porter_stem", null); filters.put("reverse", ReverseStringFilterFactory.class); + filters.put("russian_stem", SnowballPorterFilterFactory.class); + filters.put("scandinavian_normalization", null); + filters.put("scandinavian_folding", null); + filters.put("shingle", null); filters.put("snowball", SnowballPorterFilterFactory.class); + 
filters.put("sorani_normalization", null); filters.put("stemmer", PorterStemFilterFactory.class); filters.put("stop", null); filters.put("trim", null); filters.put("truncate", null); + filters.put("type_as_payload", null); filters.put("unique", Void.class); filters.put("uppercase", null); filters.put("word_delimiter", null); diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/10_match.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/10_match.yml new file mode 100644 index 00000000000..d07e06865a1 --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/10_match.yml @@ -0,0 +1,65 @@ +# integration tests for queries with specific analysis chains + +"match query with stacked stems": + # Tests the match query stemmed tokens are "stacked" on top of the unstemmed + # versions in the same position. + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 1 + number_of_replicas: 1 + analysis: + analyzer: + index: + tokenizer: standard + filter: [lowercase] + search: + tokenizer: standard + filter: [lowercase, keyword_repeat, porter_stem, unique_stem] + filter: + unique_stem: + type: unique + only_on_same_position: true + mappings: + doc: + properties: + body: + type: text + analyzer: index + search_analyzer: search + + - do: + index: + index: test + type: doc + id: 1 + body: { "text": "the fox runs across the street" } + refresh: true + + - do: + search: + body: + query: + match: + text: fox runs + operator: AND + - match: {hits.count: 1} + + - do: + index: + index: test + type: doc + id: 2 + body: { "text": "run fox run" } + refresh: true + + - do: + search: + body: + query: + match: + text: fox runs + operator: AND + - match: {hits.count: 2} diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yaml new file mode 100644 index 00000000000..cf5ebcea42e --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yaml @@ -0,0 +1,158 @@ +# Integration tests for the phrase suggester with a few analyzers + +setup: + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 1 + number_of_replicas: 1 + analysis: + analyzer: + body: + tokenizer: standard + filter: [lowercase] + bigram: + tokenizer: standard + filter: [lowercase, bigram] + ngram: + tokenizer: standard + filter: [lowercase, ngram] + filter: + bigram: + type: shingle + output_unigrams: false + min_shingle_size: 2 + max_shingle_size: 2 + ngram: + type: shingle + output_unigrams: true + min_shingle_size: 2 + max_shingle_size: 2 + mappings: + doc: + properties: + body: + type: text + analyzer: body + fields: + bigram: + type: text + analyzer: bigram + ngram: + type: text + analyzer: ngram + + - do: + bulk: + index: test + type: doc + refresh: true + body: | + { "index": {} } + { "body": "Xorr the God-Jewel" } + { "index": {} } + { "body": "Xorn" } + +--- +"sorts by score": + - do: + search: + size: 0 + index: test + body: + suggest: + text: xor the got-jewel + test: + phrase: + field: body.ngram + force_unigrams: true + max_errors: 0.5 + direct_generator: + - field: body.ngram + min_word_length: 1 + suggest_mode: always + + - match: {suggest.test.0.options.0.text: xorr the god jewel} + - match: {suggest.test.0.options.1.text: xorn the god jewel} + +--- +"breaks ties by sorting terms": + # This runs the suggester 
without bigrams so we can be sure of the sort order + - do: + search: + size: 0 + index: test + body: + suggest: + text: xor the got-jewel + test: + phrase: + field: body + analyzer: body + force_unigrams: true + max_errors: 0.5 + direct_generator: + - field: body + min_word_length: 1 + suggest_mode: always + + # The scores are identical but xorn comes first because it sorts first + - match: {suggest.test.0.options.0.text: xorn the god jewel} + - match: {suggest.test.0.options.1.text: xorr the god jewel} + - match: {suggest.test.0.options.0.score: $body.suggest.test.0.options.1.score} + +--- +"fails when asked to run on a field without unigrams": + - do: + catch: /since it doesn't emit unigrams/ + search: + size: 0 + index: test + body: + suggest: + text: xor the got-jewel + test: + phrase: + field: body.bigram + + - do: + catch: /since it doesn't emit unigrams/ + search: + size: 0 + index: test + body: + suggest: + text: xor the got-jewel + test: + phrase: + field: body.bigram + analyzer: bigram + +--- +"doesn't fail when asked to run on a field without unigrams when force_unigrams=false": + - do: + search: + size: 0 + index: test + body: + suggest: + text: xor the got-jewel + test: + phrase: + field: body.bigram + force_unigrams: false + + - do: + search: + size: 0 + index: test + body: + suggest: + text: xor the got-jewel + test: + phrase: + field: body.bigram + analyzer: bigram + force_unigrams: false diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/termvectors/10_payloads.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/termvectors/10_payloads.yml new file mode 100644 index 00000000000..d0e31758340 --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/termvectors/10_payloads.yml @@ -0,0 +1,40 @@ +"term vectors with payloads tests": + # Tests term vectors with payloads. This is in the analysis-common module + # because there are no token filters that support payloads in core. + - do: + indices.create: + index: test + body: + mappings: + doc: + properties: + text: + type: text + term_vector: with_positions_offsets_payloads + analyzer: has_payloads + settings: + number_of_shards: 1 + number_of_replicas: 1 + analysis: + analyzer: + has_payloads: + tokenizer: standard + filter: [type_as_payload] + + - do: + index: + index: test + type: doc + id: 1 + refresh: true + body: + text: The quick brown fox is brown.
+ + - do: + termvectors: + index: test + type: doc + id: 1 + payloads: true + - match: {term_vectors.text.field_statistics.sum_doc_freq: 5} + - match: {term_vectors.text.terms.brown.tokens.0.payload: PEFMUEhBTlVNPg==} diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index fe22734d974..040d2fb2dc6 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -19,8 +19,6 @@ package org.elasticsearch.indices.analysis; -import org.apache.lucene.analysis.en.PorterStemFilterFactory; -import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; @@ -97,7 +95,6 @@ import java.util.Collection; import java.util.EnumMap; import java.util.HashMap; import java.util.HashSet; -import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Set; @@ -343,29 +340,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { Map<String, Class<?>> filters = new HashMap<>(); filters.put("standard", null); filters.put("lowercase", null); - // TODO remove the loop below once all the tokenizers are migrated out of PreBuiltTokenFilters - for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) { - Class<?> luceneFactoryClass; - switch (tokenizer) { - case LOWERCASE: - // This has been migrated but has to stick around until PreBuiltTokenizers is removed. - continue; - case DUTCH_STEM: - case FRENCH_STEM: - case RUSSIAN_STEM: - luceneFactoryClass = SnowballPorterFilterFactory.class; - break; - case DELIMITED_PAYLOAD_FILTER: - luceneFactoryClass = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class; - break; - case LIMIT: - luceneFactoryClass = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class; - break; - default: - luceneFactoryClass = null; - } - filters.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClass); - } return filters; }