Move remaining pre-configured token filters into analysis-common (#24716)

Moves the remaining pre-configured token filters into the analysis-common module. A few tests in core depended on the pre-configured token filters, so I had to touch them (a minimal sketch of the new plugin-side registration API follows the list):

* `GetTermVectorsCheckDocFreqIT` depended on `type_as_payload` but didn't do anything important with it. I dropped the dependency. Then I moved the test to a single node test case because we're trying to cut down on the number of `ESIntegTestCase` subclasses.
* `AbstractTermVectorsTestCase` and its subclasses depended on `type_as_payload`. I dropped their usage of the token filter and added an integration test for the termvectors API that uses `type_as_payload` to the `analysis-common` module.
* `AnalysisModuleTests` expected a few pre-configured token filters to be registered by default. They aren't any more, so I dropped that assertion. We now assert that `CommonAnalysisPlugin` registers these pre-built token filters in `CommonAnalysisFactoryTests`.
* `SearchQueryIT` and `SuggestSearchIT` had tests that depended on the specific behavior of the token filters, so I moved those tests to REST integration tests in `analysis-common`.
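
For reference, here is a minimal sketch of the plugin-side registration that this migration relies on, using the `AnalysisPlugin`/`PreConfiguredTokenFilter` API visible in the diff below. The plugin class name is hypothetical and only the `asciifolding` filter is shown; `CommonAnalysisPlugin` registers the full list the same way.

```java
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Hypothetical example plugin that registers a single pre-configured token filter
// the same way CommonAnalysisPlugin does in this commit.
public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        // Arguments: filter name, whether it is usable in multi-term queries,
        // and a factory that wraps the incoming TokenStream.
        return Collections.singletonList(
                PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    }
}
```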
Nik Everett 2017-05-16 13:10:24 -04:00 committed by GitHub
parent a00165913b
commit 7ef390068a
15 changed files with 402 additions and 700 deletions

View File

@@ -279,22 +279,6 @@ public final class AnalysisModule {
* lucene-analyzers-common so "stop" is defined in the analysis-common
* module. */
// Add token filters declared in PreBuiltTokenFilters until they have all been migrated
for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) {
switch (preBuilt) {
case LOWERCASE:
// This has been migrated but has to stick around until PreBuiltTokenizers is removed.
continue;
default:
if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
}
String name = preBuilt.name().toLowerCase(Locale.ROOT);
preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
}
}
for (AnalysisPlugin plugin: plugins) {
for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
preConfiguredTokenFilters.register(filter.getName(), filter);

View File

@@ -20,38 +20,10 @@ package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.ar.ArabicStemFilter;
import org.apache.lucene.analysis.br.BrazilianStemFilter;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tr.ApostropheFilter;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import java.util.Locale;
@@ -66,229 +38,7 @@ public enum PreBuiltTokenFilters {
protected boolean isMultiTermAware() {
return true;
}
},
// Extended Token Filters
ELISION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
ARABIC_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ArabicStemFilter(tokenStream);
}
},
BRAZILIAN_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new BrazilianStemFilter(tokenStream);
}
},
CZECH_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new CzechStemFilter(tokenStream);
}
},
DUTCH_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new SnowballFilter(tokenStream, new DutchStemmer());
}
},
FRENCH_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new SnowballFilter(tokenStream, new FrenchStemmer());
}
},
GERMAN_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new GermanStemFilter(tokenStream);
}
},
RUSSIAN_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new SnowballFilter(tokenStream, "Russian");
}
},
KEYWORD_REPEAT(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new KeywordRepeatFilter(tokenStream);
}
},
ARABIC_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ArabicNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
PERSIAN_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new PersianNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
TYPE_AS_PAYLOAD(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new TypeAsPayloadTokenFilter(tokenStream);
}
},
SHINGLE(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ShingleFilter(tokenStream);
}
},
GERMAN_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new GermanNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
HINDI_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new HindiNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
INDIC_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new IndicNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
SORANI_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new SoraniNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ScandinavianNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
SCANDINAVIAN_FOLDING(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ScandinavianFoldingFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
APOSTROPHE(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ApostropheFilter(tokenStream);
}
},
CJK_WIDTH(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new CJKWidthFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
DECIMAL_DIGIT(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new DecimalDigitFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
CJK_BIGRAM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new CJKBigramFilter(tokenStream);
}
},
DELIMITED_PAYLOAD_FILTER(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new DelimitedPayloadTokenFilter(tokenStream, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER);
}
},
LIMIT(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new LimitTokenCountFilter(tokenStream, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS);
}
},
;
};
protected boolean isMultiTermAware() {
return false;

View File

@@ -66,7 +66,6 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
import static org.hamcrest.Matchers.equalTo;
public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
protected static class TestFieldSetting {
public final String name;
public final boolean storedOffset;
@@ -211,7 +210,7 @@ public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
Settings.Builder settings = Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase");
.putArray("index.analysis.analyzer.tv_test.filter", "lowercase");
assertAcked(prepareCreate(index).addMapping("type1", mappingBuilder).setSettings(settings).addAlias(new Alias(alias)));
}
@@ -395,11 +394,7 @@ public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
}
if (field.storedPayloads && testConfig.requestPayloads) {
assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
} else {
assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
}
assertNull("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload());
}
}
assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());

View File

@@ -1,259 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.action.termvectors;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.test.ESIntegTestCase;
import org.hamcrest.Matchers;
import java.io.IOException;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.hamcrest.Matchers.equalTo;
public class GetTermVectorsCheckDocFreqIT extends ESIntegTestCase {
@Override
protected int numberOfShards() {
return 1;
}
@Override
protected int numberOfReplicas() {
return 0;
}
@Override
public Settings indexSettings() {
return Settings.builder()
.put(super.indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")
.build();
}
public void testSimpleTermVectors() throws IOException {
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
.startObject("properties")
.startObject("field")
.field("type", "text")
.field("term_vector", "with_positions_offsets_payloads")
.field("analyzer", "tv_test")
.endObject()
.endObject()
.endObject().endObject();
assertAcked(prepareCreate("test").addMapping("type1", mapping));
ensureGreen();
int numDocs = 15;
for (int i = 0; i < numDocs; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
.setSource(XContentFactory.jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
// 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
// 31the34 35lazy39 40dog43
.endObject()).execute().actionGet();
refresh();
}
String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
for (int i = 0; i < numDocs; i++) {
checkAllInfo(numDocs, values, freq, pos, startOffset, endOffset, i);
checkWithoutTermStatistics(numDocs, values, freq, pos, startOffset, endOffset, i);
checkWithoutFieldStatistics(numDocs, values, freq, pos, startOffset, endOffset, i);
}
}
private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset,
int i) throws IOException {
TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
.setPositions(true).setTermStatistics(true).setFieldStatistics(false).setSelectedFields();
TermVectorsResponse response = resp.execute().actionGet();
assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
Fields fields = response.getFields();
assertThat(fields.size(), equalTo(1));
Terms terms = fields.terms("field");
assertThat(terms.size(), equalTo(8L));
assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
assertThat(terms.getDocCount(), Matchers.equalTo(-1));
assertThat(terms.getSumDocFreq(), equalTo((long) -1));
TermsEnum iterator = terms.iterator();
for (int j = 0; j < values.length; j++) {
String string = values[j];
BytesRef next = iterator.next();
assertThat(next, Matchers.notNullValue());
assertThat("expected " + string, string, equalTo(next.utf8ToString()));
assertThat(next, Matchers.notNullValue());
if (string.equals("the")) {
assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
} else {
assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
}
PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
assertThat(docsAndPositions.nextDoc(), equalTo(0));
assertThat(freq[j], equalTo(docsAndPositions.freq()));
assertThat(iterator.docFreq(), equalTo(numDocs));
int[] termPos = pos[j];
int[] termStartOffset = startOffset[j];
int[] termEndOffset = endOffset[j];
assertThat(termPos.length, equalTo(freq[j]));
assertThat(termStartOffset.length, equalTo(freq[j]));
assertThat(termEndOffset.length, equalTo(freq[j]));
for (int k = 0; k < freq[j]; k++) {
int nextPosition = docsAndPositions.nextPosition();
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
}
}
assertThat(iterator.next(), Matchers.nullValue());
XContentBuilder xBuilder = XContentFactory.jsonBuilder();
response.toXContent(xBuilder, null);
String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
+ i
+ "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
assertThat(utf8, equalTo(expectedString));
}
private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset,
int i) throws IOException {
TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
.setPositions(true).setTermStatistics(false).setFieldStatistics(true).setSelectedFields();
assertThat(resp.request().termStatistics(), equalTo(false));
TermVectorsResponse response = resp.execute().actionGet();
assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
Fields fields = response.getFields();
assertThat(fields.size(), equalTo(1));
Terms terms = fields.terms("field");
assertThat(terms.size(), equalTo(8L));
assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
TermsEnum iterator = terms.iterator();
for (int j = 0; j < values.length; j++) {
String string = values[j];
BytesRef next = iterator.next();
assertThat(next, Matchers.notNullValue());
assertThat("expected " + string, string, equalTo(next.utf8ToString()));
assertThat(next, Matchers.notNullValue());
assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
assertThat(docsAndPositions.nextDoc(), equalTo(0));
assertThat(freq[j], equalTo(docsAndPositions.freq()));
assertThat(iterator.docFreq(), equalTo(-1));
int[] termPos = pos[j];
int[] termStartOffset = startOffset[j];
int[] termEndOffset = endOffset[j];
assertThat(termPos.length, equalTo(freq[j]));
assertThat(termStartOffset.length, equalTo(freq[j]));
assertThat(termEndOffset.length, equalTo(freq[j]));
for (int k = 0; k < freq[j]; k++) {
int nextPosition = docsAndPositions.nextPosition();
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
}
}
assertThat(iterator.next(), Matchers.nullValue());
XContentBuilder xBuilder = XContentFactory.jsonBuilder();
response.toXContent(xBuilder, null);
String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
+ i
+ "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
assertThat(utf8, equalTo(expectedString));
}
private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i)
throws IOException {
TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
.setPositions(true).setFieldStatistics(true).setTermStatistics(true).setSelectedFields();
assertThat(resp.request().fieldStatistics(), equalTo(true));
TermVectorsResponse response = resp.execute().actionGet();
assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
Fields fields = response.getFields();
assertThat(fields.size(), equalTo(1));
Terms terms = fields.terms("field");
assertThat(terms.size(), equalTo(8L));
assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
TermsEnum iterator = terms.iterator();
for (int j = 0; j < values.length; j++) {
String string = values[j];
BytesRef next = iterator.next();
assertThat(next, Matchers.notNullValue());
assertThat("expected " + string, string, equalTo(next.utf8ToString()));
assertThat(next, Matchers.notNullValue());
if (string.equals("the")) {
assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
} else {
assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
}
PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
assertThat(docsAndPositions.nextDoc(), equalTo(0));
assertThat(freq[j], equalTo(docsAndPositions.freq()));
assertThat(iterator.docFreq(), equalTo(numDocs));
int[] termPos = pos[j];
int[] termStartOffset = startOffset[j];
int[] termEndOffset = endOffset[j];
assertThat(termPos.length, equalTo(freq[j]));
assertThat(termStartOffset.length, equalTo(freq[j]));
assertThat(termEndOffset.length, equalTo(freq[j]));
for (int k = 0; k < freq[j]; k++) {
int nextPosition = docsAndPositions.nextPosition();
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
}
}
assertThat(iterator.next(), Matchers.nullValue());
XContentBuilder xBuilder = XContentFactory.jsonBuilder();
response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
+ i
+ "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
assertThat(utf8, equalTo(expectedString));
}
}

View File

@@ -193,7 +193,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
.setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
.putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
.setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
@@ -278,7 +278,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
assertAcked(prepareCreate("test").addMapping("type1", mapping)
.setSettings(Settings.builder()
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
.putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
.setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
@@ -585,7 +585,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
.setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
.putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
ensureGreen();
@@ -645,9 +645,8 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
if (withPayloads) {
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
}
// We never configure an analyzer with payloads for this test so this is never returned
assertNull("term: " + string, docsAndPositions.getPayload());
}
}
assertThat(iterator.next(), nullValue());

View File

@@ -19,6 +19,9 @@
package org.elasticsearch.index.termvectors;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.termvectors.TermVectorsRequest;
import org.elasticsearch.action.termvectors.TermVectorsResponse;
import org.elasticsearch.common.settings.Settings;
@@ -28,6 +31,7 @@ import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.test.ESSingleNodeTestCase;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
@@ -71,4 +75,45 @@ public class TermVectorsServiceTests extends ESSingleNodeTestCase {
assertThat(response, notNullValue());
assertThat(response.getTookInMillis(), equalTo(TimeUnit.NANOSECONDS.toMillis(longs.get(1) - longs.get(0))));
}
public void testDocFreqs() throws IOException {
XContentBuilder mapping = jsonBuilder()
.startObject()
.startObject("doc")
.startObject("properties")
.startObject("text")
.field("type", "text")
.field("term_vector", "with_positions_offsets_payloads")
.endObject()
.endObject()
.endObject()
.endObject();
Settings settings = Settings.builder()
.put("number_of_shards", 1)
.build();
createIndex("test", settings, "doc", mapping);
ensureGreen();
int max = between(3, 10);
BulkRequestBuilder bulk = client().prepareBulk();
for (int i = 0; i < max; i++) {
bulk.add(client().prepareIndex("test", "doc", Integer.toString(i))
.setSource("text", "the quick brown fox jumped over the lazy dog"));
}
bulk.get();
TermVectorsRequest request = new TermVectorsRequest("test", "doc", "0").termStatistics(true);
IndicesService indicesService = getInstanceFromNode(IndicesService.class);
IndexService test = indicesService.indexService(resolveIndex("test"));
IndexShard shard = test.getShardOrNull(0);
assertThat(shard, notNullValue());
TermVectorsResponse response = TermVectorsService.getTermVectors(shard, request);
Terms terms = response.getFields().terms("text");
TermsEnum iterator = terms.iterator();
while (iterator.next() != null) {
assertEquals(max, iterator.docFreq());
}
}
}

View File

@@ -23,11 +23,8 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.store.Directory;
@@ -127,12 +124,6 @@ public class AnalysisModuleTests extends ESTestCase {
testSimpleConfiguration(settings);
}
public void testDefaultFactoryTokenFilters() throws IOException {
assertTokenFilter("keyword_repeat", KeywordRepeatFilter.class);
assertTokenFilter("persian_normalization", PersianNormalizationFilter.class);
assertTokenFilter("arabic_normalization", ArabicNormalizationFilter.class);
}
public void testAnalyzerAliasNotAllowedPost5x() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.foobar.type", "standard")

View File

@@ -1550,30 +1550,6 @@ public class SearchQueryIT extends ESIntegTestCase {
assertHitCount(searchResponse, 2);
}
public void testMatchQueryWithStackedStems() throws IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.index.type", "custom")
.put("index.analysis.analyzer.index.tokenizer", "standard")
.put("index.analysis.analyzer.index.filter", "lowercase")
.put("index.analysis.analyzer.search.type", "custom")
.put("index.analysis.analyzer.search.tokenizer", "standard")
.putArray("index.analysis.analyzer.search.filter", "lowercase", "keyword_repeat", "porter_stem", "unique_stem")
.put("index.analysis.filter.unique_stem.type", "unique")
.put("index.analysis.filter.unique_stem.only_on_same_position", true));
assertAcked(builder.addMapping("test", "text", "type=text,analyzer=index,search_analyzer=search"));
client().prepareIndex("test", "test", "1").setSource("text", "the fox runs across the street").get();
refresh();
SearchResponse searchResponse = client().prepareSearch("test").setQuery(matchQuery("text", "fox runs").operator(Operator.AND)).get();
assertHitCount(searchResponse, 1);
client().prepareIndex("test", "test", "2").setSource("text", "run fox run").get();
refresh();
searchResponse = client().prepareSearch("test").setQuery(matchQuery("text", "fox runs").operator(Operator.AND)).get();
assertHitCount(searchResponse, 2);
}
public void testQueryStringWithSynonyms() throws IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())

View File

@@ -694,107 +694,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
assertSuggestion(searchSuggest, 0, "simple_phrase", "xorr the god jewel");
}
public void testPhraseBoundaryCases() throws IOException, URISyntaxException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
.put("index.analysis.analyzer.body.tokenizer", "standard")
.putArray("index.analysis.analyzer.body.filter", "lowercase")
.put("index.analysis.analyzer.bigram.tokenizer", "standard")
.putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase")
.put("index.analysis.analyzer.ngram.tokenizer", "standard")
.putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase")
.put("index.analysis.analyzer.myDefAnalyzer.tokenizer", "standard")
.putArray("index.analysis.analyzer.myDefAnalyzer.filter", "shingle", "lowercase")
.put("index.analysis.filter.my_shingle.type", "shingle")
.put("index.analysis.filter.my_shingle.output_unigrams", false)
.put("index.analysis.filter.my_shingle.min_shingle_size", 2)
.put("index.analysis.filter.my_shingle.max_shingle_size", 2)
.put("index.analysis.filter.my_shingle2.type", "shingle")
.put("index.analysis.filter.my_shingle2.output_unigrams", true)
.put("index.analysis.filter.my_shingle2.min_shingle_size", 2)
.put("index.analysis.filter.my_shingle2.max_shingle_size", 2));
XContentBuilder mapping = XContentFactory.jsonBuilder()
.startObject().startObject("type1")
.startObject("properties")
.startObject("body").field("type", "text").field("analyzer", "body").endObject()
.startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject()
.startObject("ngram").field("type", "text").field("analyzer", "ngram").endObject()
.endObject()
.endObject().endObject();
assertAcked(builder.addMapping("type1", mapping));
ensureGreen();
String[] strings = new String[]{
"Xorr the God-Jewel",
"Grog the God-Crusher",
"Xorn",
"Walter Newell",
"Wanda Maximoff",
"Captain America",
"American Ace",
"Wundarr the Aquarian",
"Will o' the Wisp",
"Xemnu the Titan"
};
for (String line : strings) {
index("test", "type1", line, "body", line, "bigram", line, "ngram", line);
}
refresh();
NumShards numShards = getNumShards("test");
// Lets make sure some things throw exceptions
PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram")
.analyzer("body")
.addCandidateGenerator(candidateGenerator("does_not_exist").minWordLength(1).suggestMode("always"))
.realWordErrorLikelihood(0.95f)
.maxErrors(0.5f)
.size(1);
phraseSuggestion.clearCandidateGenerators().analyzer(null);
try {
searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion));
fail("analyzer does only produce ngrams");
} catch (SearchPhaseExecutionException e) {
}
phraseSuggestion.analyzer("bigram");
try {
searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion));
fail("analyzer does only produce ngrams");
} catch (SearchPhaseExecutionException e) {
}
// Now we'll make sure some things don't
phraseSuggestion.forceUnigrams(false);
searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
// Field doesn't produce unigrams but the analyzer does
phraseSuggestion.forceUnigrams(true).analyzer("ngram");
searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
phraseSuggestion = phraseSuggestion("ngram")
.analyzer("myDefAnalyzer")
.forceUnigrams(true)
.realWordErrorLikelihood(0.95f)
.maxErrors(0.5f)
.size(1)
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
Suggest suggest = searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
// "xorr the god jewel" and and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by
// earlier term (xorn):
assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");
phraseSuggestion.analyzer(null);
suggest = searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
// In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
// probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
// others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
}
public void testDifferentShardSize() throws Exception {
createIndex("test");
ensureGreen();

View File

@@ -21,13 +21,31 @@ package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.ar.ArabicStemFilter;
import org.apache.lucene.analysis.br.BrazilianStemFilter;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.UpperCaseFilter;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
@@ -35,16 +53,25 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.tr.ApostropheFilter;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import java.util.ArrayList;
import java.util.List;
@@ -74,29 +101,61 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
List<PreConfiguredTokenFilter> filters = new ArrayList<>();
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, input -> new ASCIIFoldingFilter(input)));
filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
new DelimitedPayloadTokenFilter(input,
DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
// TODO deprecate edgeNGram
filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input ->
new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
filters.add(PreConfiguredTokenFilter.singleton("elision", true,
input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
filters.add(PreConfiguredTokenFilter.singleton("limit", false, input ->
new LimitTokenCountFilter(input,
LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
// TODO deprecate nGram
filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("reverse", false, input -> new ReverseStringFilter(input)));
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, ShingleFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
// The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("unique", false, input -> new UniqueTokenFilter(input)));
filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->

View File

@@ -20,6 +20,8 @@
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.en.PorterStemFilterFactory;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
@@ -68,22 +70,46 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
@Override
protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenFilters());
filters.put("apostrophe", null);
filters.put("arabic_normalization", null);
filters.put("arabic_stem", null);
filters.put("asciifolding", null);
filters.put("brazilian_stem", null);
filters.put("cjk_bigram", null);
filters.put("cjk_width", null);
filters.put("classic", null);
filters.put("common_grams", null);
filters.put("czech_stem", null);
filters.put("decimal_digit", null);
filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class);
filters.put("dutch_stem", SnowballPorterFilterFactory.class);
filters.put("edge_ngram", null);
filters.put("edgeNGram", null);
filters.put("elision", null);
filters.put("french_stem", SnowballPorterFilterFactory.class);
filters.put("german_stem", null);
filters.put("hindi_normalization", null);
filters.put("indic_normalization", null);
filters.put("keyword_repeat", null);
filters.put("kstem", null);
filters.put("length", null);
filters.put("limit", LimitTokenCountFilterFactory.class);
filters.put("ngram", null);
filters.put("nGram", null);
filters.put("persian_normalization", null);
filters.put("porter_stem", null);
filters.put("reverse", ReverseStringFilterFactory.class);
filters.put("russian_stem", SnowballPorterFilterFactory.class);
filters.put("scandinavian_normalization", null);
filters.put("scandinavian_folding", null);
filters.put("shingle", null);
filters.put("snowball", SnowballPorterFilterFactory.class);
filters.put("sorani_normalization", null);
filters.put("stemmer", PorterStemFilterFactory.class);
filters.put("stop", null);
filters.put("trim", null);
filters.put("truncate", null);
filters.put("type_as_payload", null);
filters.put("unique", Void.class);
filters.put("uppercase", null);
filters.put("word_delimiter", null);

View File

@@ -0,0 +1,65 @@
# integration tests for queries with specific analysis chains
"match query with stacked stems":
# Tests that the match query's stemmed tokens are "stacked" on top of the unstemmed
# versions in the same position.
- do:
indices.create:
index: test
body:
settings:
number_of_shards: 1
number_of_replicas: 1
analysis:
analyzer:
index:
tokenizer: standard
filter: [lowercase]
search:
tokenizer: standard
filter: [lowercase, keyword_repeat, porter_stem, unique_stem]
filter:
unique_stem:
type: unique
only_on_same_position: true
mappings:
doc:
properties:
text:
type: text
analyzer: index
search_analyzer: search
- do:
index:
index: test
type: doc
id: 1
body: { "text": "the fox runs across the street" }
refresh: true
- do:
search:
body:
query:
match:
text: fox runs
operator: AND
- match: {hits.total: 1}
- do:
index:
index: test
type: doc
id: 2
body: { "text": "run fox run" }
refresh: true
- do:
search:
body:
query:
match:
text: fox runs
operator: AND
- match: {hits.total: 2}

View File

@@ -0,0 +1,158 @@
# Integration tests for the phrase suggester with a few analyzers
setup:
- do:
indices.create:
index: test
body:
settings:
number_of_shards: 1
number_of_replicas: 1
analysis:
analyzer:
body:
tokenizer: standard
filter: [lowercase]
bigram:
tokenizer: standard
filter: [lowercase, bigram]
ngram:
tokenizer: standard
filter: [lowercase, ngram]
filter:
bigram:
type: shingle
output_unigrams: false
min_shingle_size: 2
max_shingle_size: 2
ngram:
type: shingle
output_unigrams: true
min_shingle_size: 2
max_shingle_size: 2
mappings:
doc:
properties:
body:
type: text
analyzer: body
fields:
bigram:
type: text
analyzer: bigram
ngram:
type: text
analyzer: ngram
- do:
bulk:
index: test
type: doc
refresh: true
body: |
{ "index": {} }
{ "body": "Xorr the God-Jewel" }
{ "index": {} }
{ "body": "Xorn" }
---
"sorts by score":
- do:
search:
size: 0
index: test
body:
suggest:
text: xor the got-jewel
test:
phrase:
field: body.ngram
force_unigrams: true
max_errors: 0.5
direct_generator:
- field: body.ngram
min_word_length: 1
suggest_mode: always
- match: {suggest.test.0.options.0.text: xorr the god jewel}
- match: {suggest.test.0.options.1.text: xorn the god jewel}
---
"breaks ties by sorting terms":
# This runs the suggester without bigrams so we can be sure of the sort order
- do:
search:
size: 0
index: test
body:
suggest:
text: xor the got-jewel
test:
phrase:
field: body
analyzer: body
force_unigrams: true
max_errors: 0.5
direct_generator:
- field: body
min_word_length: 1
suggest_mode: always
# The scores are identical but xorn comes first because it sorts first
- match: {suggest.test.0.options.0.text: xorn the god jewel}
- match: {suggest.test.0.options.1.text: xorr the god jewel}
- match: {suggest.test.0.options.0.score: $body.suggest.test.0.options.0.score}
---
"fails when asked to run on a field without unigrams":
- do:
catch: /since it doesn't emit unigrams/
search:
size: 0
index: test
body:
suggest:
text: xor the got-jewel
test:
phrase:
field: body.bigram
- do:
catch: /since it doesn't emit unigrams/
search:
size: 0
index: test
body:
suggest:
text: xor the got-jewel
test:
phrase:
field: body.bigram
analyzer: bigram
---
"doesn't fail when asked to run on a field without unigrams when force_unigrams=false":
- do:
search:
size: 0
index: test
body:
suggest:
text: xor the got-jewel
test:
phrase:
field: body.bigram
force_unigrams: false
- do:
search:
size: 0
index: test
body:
suggest:
text: xor the got-jewel
test:
phrase:
field: body.bigram
analyzer: bigram
force_unigrams: false

View File

@@ -0,0 +1,40 @@
"term vectors with payloads tests":
# Tests term vectors with payloads. This is in the analysis-common module
# because there are no token filters that support payloads in core.
- do:
indices.create:
index: test
body:
mappings:
doc:
properties:
text:
type: text
term_vector: with_positions_offsets_payloads
analyzer: has_payloads
settings:
number_of_shards: 1
number_of_replicas: 1
analysis:
analyzer:
has_payloads:
tokenizer: standard
filter: [type_as_payload]
- do:
index:
index: test
type: doc
id: 1
refresh: true
body:
text: The quick brown fox is brown.
- do:
termvectors:
index: test
type: doc
id: 1
payloads: true
- match: {term_vectors.text.field_statistics.sum_doc_freq: 5}
- match: {term_vectors.text.terms.brown.tokens.0.payload: 10}

View File

@@ -19,8 +19,6 @@
package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.en.PorterStemFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
@@ -97,7 +95,6 @@ import java.util.Collection;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
@@ -343,29 +340,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
Map<String, Class<?>> filters = new HashMap<>();
filters.put("standard", null);
filters.put("lowercase", null);
// TODO remove the loop below once all the tokenizers are migrated out of PreBuiltTokenFilters
for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) {
Class<?> luceneFactoryClass;
switch (tokenizer) {
case LOWERCASE:
// This has been migrated but has to stick around until PreBuiltTokenizers is removed.
continue;
case DUTCH_STEM:
case FRENCH_STEM:
case RUSSIAN_STEM:
luceneFactoryClass = SnowballPorterFilterFactory.class;
break;
case DELIMITED_PAYLOAD_FILTER:
luceneFactoryClass = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class;
break;
case LIMIT:
luceneFactoryClass = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class;
break;
default:
luceneFactoryClass = null;
}
filters.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClass);
}
return filters;
}