Move remaining pre-configured token filters into analysis-common (#24716)
Moves the remaining preconfigured token figured into the analysis-common module. There were a couple of tests in core that depended on the pre-configured token filters so I had to touch them: * `GetTermVectorsCheckDocFreqIT` depended on `type_as_payload` but didn't do anything important with it. I dropped the dependency. Then I moved the test to a single node test case because we're trying to cut down on the number of `ESIntegTestCase` subclasses. * `AbstractTermVectorsTestCase` and its subclasses depended on `type_as_payload`. I dropped their usage of the token filter and added an integration test for the termvectors API that uses `type_as_payload` to the `analysis-common` module. * `AnalysisModuleTests` expected a few pre-configured token filtes be registered by default. They aren't any more so I dropped this assertion. We assert that the `CommonAnalysisPlugin` registers these pre-built token filters in `CommonAnalysisFactoryTests` * `SearchQueryIT` and `SuggestSearchIT` had tests that depended on the specific behavior of the token filters so I moved the tests to integration tests in `analysis-common`.
This commit is contained in:
parent
a00165913b
commit
7ef390068a
|
@ -278,22 +278,6 @@ public final class AnalysisModule {
|
|||
* version uses a set of English stop words that are in
|
||||
* lucene-analyzers-common so "stop" is defined in the analysis-common
|
||||
* module. */
|
||||
|
||||
// Add token filters declared in PreBuiltTokenFilters until they have all been migrated
|
||||
for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) {
|
||||
switch (preBuilt) {
|
||||
case LOWERCASE:
|
||||
// This has been migrated but has to stick around until PreBuiltTokenizers is removed.
|
||||
continue;
|
||||
default:
|
||||
if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
|
||||
throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
|
||||
}
|
||||
String name = preBuilt.name().toLowerCase(Locale.ROOT);
|
||||
preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
|
||||
tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
|
||||
}
|
||||
}
|
||||
|
||||
for (AnalysisPlugin plugin: plugins) {
|
||||
for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
|
||||
|
|
|
@ -20,38 +20,10 @@ package org.elasticsearch.indices.analysis;
|
|||
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
||||
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
|
||||
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
|
||||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||
import org.apache.lucene.analysis.cz.CzechStemFilter;
|
||||
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
|
||||
import org.apache.lucene.analysis.de.GermanStemFilter;
|
||||
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
|
||||
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
|
||||
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
|
||||
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
|
||||
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
|
||||
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.lucene.analysis.tr.ApostropheFilter;
|
||||
import org.apache.lucene.analysis.util.ElisionFilter;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
|
||||
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
|
||||
import org.tartarus.snowball.ext.DutchStemmer;
|
||||
import org.tartarus.snowball.ext.FrenchStemmer;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
|
@ -66,229 +38,7 @@ public enum PreBuiltTokenFilters {
|
|||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
// Extended Token Filters
|
||||
ELISION(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
ARABIC_STEM(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new ArabicStemFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
BRAZILIAN_STEM(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new BrazilianStemFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
CZECH_STEM(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new CzechStemFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
DUTCH_STEM(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new SnowballFilter(tokenStream, new DutchStemmer());
|
||||
}
|
||||
},
|
||||
|
||||
FRENCH_STEM(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new SnowballFilter(tokenStream, new FrenchStemmer());
|
||||
}
|
||||
},
|
||||
|
||||
GERMAN_STEM(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new GermanStemFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
RUSSIAN_STEM(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new SnowballFilter(tokenStream, "Russian");
|
||||
}
|
||||
},
|
||||
|
||||
KEYWORD_REPEAT(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new KeywordRepeatFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
ARABIC_NORMALIZATION(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new ArabicNormalizationFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
PERSIAN_NORMALIZATION(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new PersianNormalizationFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
TYPE_AS_PAYLOAD(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new TypeAsPayloadTokenFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
SHINGLE(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new ShingleFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
GERMAN_NORMALIZATION(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new GermanNormalizationFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
HINDI_NORMALIZATION(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new HindiNormalizationFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
INDIC_NORMALIZATION(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new IndicNormalizationFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
SORANI_NORMALIZATION(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new SoraniNormalizationFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new ScandinavianNormalizationFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
SCANDINAVIAN_FOLDING(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new ScandinavianFoldingFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
APOSTROPHE(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new ApostropheFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
CJK_WIDTH(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new CJKWidthFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
DECIMAL_DIGIT(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new DecimalDigitFilter(tokenStream);
|
||||
}
|
||||
@Override
|
||||
protected boolean isMultiTermAware() {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
|
||||
CJK_BIGRAM(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new CJKBigramFilter(tokenStream);
|
||||
}
|
||||
},
|
||||
|
||||
DELIMITED_PAYLOAD_FILTER(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new DelimitedPayloadTokenFilter(tokenStream, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER);
|
||||
}
|
||||
},
|
||||
|
||||
LIMIT(CachingStrategy.ONE) {
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream, Version version) {
|
||||
return new LimitTokenCountFilter(tokenStream, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS);
|
||||
}
|
||||
},
|
||||
|
||||
;
|
||||
};
|
||||
|
||||
protected boolean isMultiTermAware() {
|
||||
return false;
|
||||
|
|
|
@ -66,7 +66,6 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
|
|||
import static org.hamcrest.Matchers.equalTo;
|
||||
|
||||
public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
|
||||
|
||||
protected static class TestFieldSetting {
|
||||
public final String name;
|
||||
public final boolean storedOffset;
|
||||
|
@ -211,7 +210,7 @@ public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
|
|||
Settings.Builder settings = Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase");
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "lowercase");
|
||||
assertAcked(prepareCreate(index).addMapping("type1", mappingBuilder).setSettings(settings).addAlias(new Alias(alias)));
|
||||
}
|
||||
|
||||
|
@ -395,11 +394,7 @@ public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
|
|||
assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
|
||||
assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
|
||||
}
|
||||
if (field.storedPayloads && testConfig.requestPayloads) {
|
||||
assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
|
||||
} else {
|
||||
assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
|
||||
}
|
||||
assertNull("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload());
|
||||
}
|
||||
}
|
||||
assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
|
||||
|
|
|
@ -1,259 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.action.termvectors;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.ToXContent;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.test.ESIntegTestCase;
|
||||
import org.hamcrest.Matchers;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
|
||||
public class GetTermVectorsCheckDocFreqIT extends ESIntegTestCase {
|
||||
|
||||
@Override
|
||||
protected int numberOfShards() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int numberOfReplicas() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Settings indexSettings() {
|
||||
return Settings.builder()
|
||||
.put(super.indexSettings())
|
||||
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")
|
||||
.build();
|
||||
}
|
||||
|
||||
public void testSimpleTermVectors() throws IOException {
|
||||
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
|
||||
.startObject("properties")
|
||||
.startObject("field")
|
||||
.field("type", "text")
|
||||
.field("term_vector", "with_positions_offsets_payloads")
|
||||
.field("analyzer", "tv_test")
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject().endObject();
|
||||
assertAcked(prepareCreate("test").addMapping("type1", mapping));
|
||||
ensureGreen();
|
||||
int numDocs = 15;
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
client().prepareIndex("test", "type1", Integer.toString(i))
|
||||
.setSource(XContentFactory.jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
|
||||
// 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
|
||||
// 31the34 35lazy39 40dog43
|
||||
.endObject()).execute().actionGet();
|
||||
refresh();
|
||||
}
|
||||
String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
|
||||
int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
|
||||
int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
|
||||
int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
|
||||
int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
checkAllInfo(numDocs, values, freq, pos, startOffset, endOffset, i);
|
||||
checkWithoutTermStatistics(numDocs, values, freq, pos, startOffset, endOffset, i);
|
||||
checkWithoutFieldStatistics(numDocs, values, freq, pos, startOffset, endOffset, i);
|
||||
}
|
||||
}
|
||||
|
||||
private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset,
|
||||
int i) throws IOException {
|
||||
TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
|
||||
.setPositions(true).setTermStatistics(true).setFieldStatistics(false).setSelectedFields();
|
||||
TermVectorsResponse response = resp.execute().actionGet();
|
||||
assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
|
||||
Fields fields = response.getFields();
|
||||
assertThat(fields.size(), equalTo(1));
|
||||
Terms terms = fields.terms("field");
|
||||
assertThat(terms.size(), equalTo(8L));
|
||||
assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
|
||||
assertThat(terms.getDocCount(), Matchers.equalTo(-1));
|
||||
assertThat(terms.getSumDocFreq(), equalTo((long) -1));
|
||||
TermsEnum iterator = terms.iterator();
|
||||
for (int j = 0; j < values.length; j++) {
|
||||
String string = values[j];
|
||||
BytesRef next = iterator.next();
|
||||
assertThat(next, Matchers.notNullValue());
|
||||
assertThat("expected " + string, string, equalTo(next.utf8ToString()));
|
||||
assertThat(next, Matchers.notNullValue());
|
||||
if (string.equals("the")) {
|
||||
assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
|
||||
} else {
|
||||
assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
|
||||
}
|
||||
|
||||
PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
|
||||
assertThat(docsAndPositions.nextDoc(), equalTo(0));
|
||||
assertThat(freq[j], equalTo(docsAndPositions.freq()));
|
||||
assertThat(iterator.docFreq(), equalTo(numDocs));
|
||||
int[] termPos = pos[j];
|
||||
int[] termStartOffset = startOffset[j];
|
||||
int[] termEndOffset = endOffset[j];
|
||||
assertThat(termPos.length, equalTo(freq[j]));
|
||||
assertThat(termStartOffset.length, equalTo(freq[j]));
|
||||
assertThat(termEndOffset.length, equalTo(freq[j]));
|
||||
for (int k = 0; k < freq[j]; k++) {
|
||||
int nextPosition = docsAndPositions.nextPosition();
|
||||
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
|
||||
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
|
||||
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
|
||||
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
|
||||
}
|
||||
}
|
||||
assertThat(iterator.next(), Matchers.nullValue());
|
||||
|
||||
XContentBuilder xBuilder = XContentFactory.jsonBuilder();
|
||||
response.toXContent(xBuilder, null);
|
||||
String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
|
||||
String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
|
||||
+ i
|
||||
+ "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
|
||||
assertThat(utf8, equalTo(expectedString));
|
||||
|
||||
}
|
||||
|
||||
private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset,
|
||||
int i) throws IOException {
|
||||
TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
|
||||
.setPositions(true).setTermStatistics(false).setFieldStatistics(true).setSelectedFields();
|
||||
assertThat(resp.request().termStatistics(), equalTo(false));
|
||||
TermVectorsResponse response = resp.execute().actionGet();
|
||||
assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
|
||||
Fields fields = response.getFields();
|
||||
assertThat(fields.size(), equalTo(1));
|
||||
Terms terms = fields.terms("field");
|
||||
assertThat(terms.size(), equalTo(8L));
|
||||
assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
|
||||
assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
|
||||
assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
|
||||
TermsEnum iterator = terms.iterator();
|
||||
for (int j = 0; j < values.length; j++) {
|
||||
String string = values[j];
|
||||
BytesRef next = iterator.next();
|
||||
assertThat(next, Matchers.notNullValue());
|
||||
assertThat("expected " + string, string, equalTo(next.utf8ToString()));
|
||||
assertThat(next, Matchers.notNullValue());
|
||||
|
||||
assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
|
||||
|
||||
PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
|
||||
assertThat(docsAndPositions.nextDoc(), equalTo(0));
|
||||
assertThat(freq[j], equalTo(docsAndPositions.freq()));
|
||||
assertThat(iterator.docFreq(), equalTo(-1));
|
||||
int[] termPos = pos[j];
|
||||
int[] termStartOffset = startOffset[j];
|
||||
int[] termEndOffset = endOffset[j];
|
||||
assertThat(termPos.length, equalTo(freq[j]));
|
||||
assertThat(termStartOffset.length, equalTo(freq[j]));
|
||||
assertThat(termEndOffset.length, equalTo(freq[j]));
|
||||
for (int k = 0; k < freq[j]; k++) {
|
||||
int nextPosition = docsAndPositions.nextPosition();
|
||||
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
|
||||
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
|
||||
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
|
||||
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
|
||||
}
|
||||
}
|
||||
assertThat(iterator.next(), Matchers.nullValue());
|
||||
|
||||
XContentBuilder xBuilder = XContentFactory.jsonBuilder();
|
||||
response.toXContent(xBuilder, null);
|
||||
String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
|
||||
String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
|
||||
+ i
|
||||
+ "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
|
||||
assertThat(utf8, equalTo(expectedString));
|
||||
|
||||
}
|
||||
|
||||
private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i)
|
||||
throws IOException {
|
||||
TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
|
||||
.setPositions(true).setFieldStatistics(true).setTermStatistics(true).setSelectedFields();
|
||||
assertThat(resp.request().fieldStatistics(), equalTo(true));
|
||||
TermVectorsResponse response = resp.execute().actionGet();
|
||||
assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
|
||||
Fields fields = response.getFields();
|
||||
assertThat(fields.size(), equalTo(1));
|
||||
Terms terms = fields.terms("field");
|
||||
assertThat(terms.size(), equalTo(8L));
|
||||
assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
|
||||
assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
|
||||
assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
|
||||
TermsEnum iterator = terms.iterator();
|
||||
for (int j = 0; j < values.length; j++) {
|
||||
String string = values[j];
|
||||
BytesRef next = iterator.next();
|
||||
assertThat(next, Matchers.notNullValue());
|
||||
assertThat("expected " + string, string, equalTo(next.utf8ToString()));
|
||||
assertThat(next, Matchers.notNullValue());
|
||||
if (string.equals("the")) {
|
||||
assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
|
||||
} else {
|
||||
assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
|
||||
}
|
||||
|
||||
PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
|
||||
assertThat(docsAndPositions.nextDoc(), equalTo(0));
|
||||
assertThat(freq[j], equalTo(docsAndPositions.freq()));
|
||||
assertThat(iterator.docFreq(), equalTo(numDocs));
|
||||
int[] termPos = pos[j];
|
||||
int[] termStartOffset = startOffset[j];
|
||||
int[] termEndOffset = endOffset[j];
|
||||
assertThat(termPos.length, equalTo(freq[j]));
|
||||
assertThat(termStartOffset.length, equalTo(freq[j]));
|
||||
assertThat(termEndOffset.length, equalTo(freq[j]));
|
||||
for (int k = 0; k < freq[j]; k++) {
|
||||
int nextPosition = docsAndPositions.nextPosition();
|
||||
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
|
||||
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
|
||||
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
|
||||
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
|
||||
}
|
||||
}
|
||||
assertThat(iterator.next(), Matchers.nullValue());
|
||||
|
||||
XContentBuilder xBuilder = XContentFactory.jsonBuilder();
|
||||
response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
|
||||
String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
|
||||
String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
|
||||
+ i
|
||||
+ "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
|
||||
assertThat(utf8, equalTo(expectedString));
|
||||
}
|
||||
|
||||
}
|
|
@ -193,7 +193,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
|
|||
.setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
|
||||
for (int i = 0; i < 10; i++) {
|
||||
client().prepareIndex("test", "type1", Integer.toString(i))
|
||||
.setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
|
||||
|
@ -278,7 +278,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
|
|||
assertAcked(prepareCreate("test").addMapping("type1", mapping)
|
||||
.setSettings(Settings.builder()
|
||||
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
|
||||
for (int i = 0; i < 10; i++) {
|
||||
client().prepareIndex("test", "type1", Integer.toString(i))
|
||||
.setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
|
||||
|
@ -585,7 +585,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
|
|||
.setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
|
||||
.putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
|
||||
|
||||
ensureGreen();
|
||||
|
||||
|
@ -645,9 +645,8 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
|
|||
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
|
||||
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
|
||||
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
|
||||
if (withPayloads) {
|
||||
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
|
||||
}
|
||||
// We never configure an analyzer with payloads for this test so this is never returned
|
||||
assertNull("term: " + string, docsAndPositions.getPayload());
|
||||
}
|
||||
}
|
||||
assertThat(iterator.next(), nullValue());
|
||||
|
|
|
@ -19,6 +19,9 @@
|
|||
|
||||
package org.elasticsearch.index.termvectors;
|
||||
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.elasticsearch.action.bulk.BulkRequestBuilder;
|
||||
import org.elasticsearch.action.termvectors.TermVectorsRequest;
|
||||
import org.elasticsearch.action.termvectors.TermVectorsResponse;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
|
@ -28,6 +31,7 @@ import org.elasticsearch.index.shard.IndexShard;
|
|||
import org.elasticsearch.indices.IndicesService;
|
||||
import org.elasticsearch.test.ESSingleNodeTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Stream;
|
||||
|
@ -71,4 +75,45 @@ public class TermVectorsServiceTests extends ESSingleNodeTestCase {
|
|||
assertThat(response, notNullValue());
|
||||
assertThat(response.getTookInMillis(), equalTo(TimeUnit.NANOSECONDS.toMillis(longs.get(1) - longs.get(0))));
|
||||
}
|
||||
|
||||
public void testDocFreqs() throws IOException {
|
||||
XContentBuilder mapping = jsonBuilder()
|
||||
.startObject()
|
||||
.startObject("doc")
|
||||
.startObject("properties")
|
||||
.startObject("text")
|
||||
.field("type", "text")
|
||||
.field("term_vector", "with_positions_offsets_payloads")
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject();
|
||||
Settings settings = Settings.builder()
|
||||
.put("number_of_shards", 1)
|
||||
.build();
|
||||
createIndex("test", settings, "doc", mapping);
|
||||
ensureGreen();
|
||||
|
||||
int max = between(3, 10);
|
||||
BulkRequestBuilder bulk = client().prepareBulk();
|
||||
for (int i = 0; i < max; i++) {
|
||||
bulk.add(client().prepareIndex("test", "doc", Integer.toString(i))
|
||||
.setSource("text", "the quick brown fox jumped over the lazy dog"));
|
||||
}
|
||||
bulk.get();
|
||||
|
||||
TermVectorsRequest request = new TermVectorsRequest("test", "doc", "0").termStatistics(true);
|
||||
|
||||
IndicesService indicesService = getInstanceFromNode(IndicesService.class);
|
||||
IndexService test = indicesService.indexService(resolveIndex("test"));
|
||||
IndexShard shard = test.getShardOrNull(0);
|
||||
assertThat(shard, notNullValue());
|
||||
TermVectorsResponse response = TermVectorsService.getTermVectors(shard, request);
|
||||
|
||||
Terms terms = response.getFields().terms("text");
|
||||
TermsEnum iterator = terms.iterator();
|
||||
while (iterator.next() != null) {
|
||||
assertEquals(max, iterator.docFreq());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,11 +23,8 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
|
||||
import org.apache.lucene.analysis.hunspell.Dictionary;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
@ -127,12 +124,6 @@ public class AnalysisModuleTests extends ESTestCase {
|
|||
testSimpleConfiguration(settings);
|
||||
}
|
||||
|
||||
public void testDefaultFactoryTokenFilters() throws IOException {
|
||||
assertTokenFilter("keyword_repeat", KeywordRepeatFilter.class);
|
||||
assertTokenFilter("persian_normalization", PersianNormalizationFilter.class);
|
||||
assertTokenFilter("arabic_normalization", ArabicNormalizationFilter.class);
|
||||
}
|
||||
|
||||
public void testAnalyzerAliasNotAllowedPost5x() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.analyzer.foobar.type", "standard")
|
||||
|
|
|
@ -1550,30 +1550,6 @@ public class SearchQueryIT extends ESIntegTestCase {
|
|||
assertHitCount(searchResponse, 2);
|
||||
}
|
||||
|
||||
public void testMatchQueryWithStackedStems() throws IOException {
|
||||
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put("index.analysis.analyzer.index.type", "custom")
|
||||
.put("index.analysis.analyzer.index.tokenizer", "standard")
|
||||
.put("index.analysis.analyzer.index.filter", "lowercase")
|
||||
.put("index.analysis.analyzer.search.type", "custom")
|
||||
.put("index.analysis.analyzer.search.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.search.filter", "lowercase", "keyword_repeat", "porter_stem", "unique_stem")
|
||||
.put("index.analysis.filter.unique_stem.type", "unique")
|
||||
.put("index.analysis.filter.unique_stem.only_on_same_position", true));
|
||||
assertAcked(builder.addMapping("test", "text", "type=text,analyzer=index,search_analyzer=search"));
|
||||
|
||||
client().prepareIndex("test", "test", "1").setSource("text", "the fox runs across the street").get();
|
||||
refresh();
|
||||
SearchResponse searchResponse = client().prepareSearch("test").setQuery(matchQuery("text", "fox runs").operator(Operator.AND)).get();
|
||||
assertHitCount(searchResponse, 1);
|
||||
|
||||
client().prepareIndex("test", "test", "2").setSource("text", "run fox run").get();
|
||||
refresh();
|
||||
searchResponse = client().prepareSearch("test").setQuery(matchQuery("text", "fox runs").operator(Operator.AND)).get();
|
||||
assertHitCount(searchResponse, 2);
|
||||
}
|
||||
|
||||
public void testQueryStringWithSynonyms() throws IOException {
|
||||
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
|
|
|
@ -694,107 +694,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
|
|||
assertSuggestion(searchSuggest, 0, "simple_phrase", "xorr the god jewel");
|
||||
}
|
||||
|
||||
public void testPhraseBoundaryCases() throws IOException, URISyntaxException {
|
||||
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
|
||||
.put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
|
||||
.put("index.analysis.analyzer.body.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.body.filter", "lowercase")
|
||||
.put("index.analysis.analyzer.bigram.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase")
|
||||
.put("index.analysis.analyzer.ngram.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase")
|
||||
.put("index.analysis.analyzer.myDefAnalyzer.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.myDefAnalyzer.filter", "shingle", "lowercase")
|
||||
.put("index.analysis.filter.my_shingle.type", "shingle")
|
||||
.put("index.analysis.filter.my_shingle.output_unigrams", false)
|
||||
.put("index.analysis.filter.my_shingle.min_shingle_size", 2)
|
||||
.put("index.analysis.filter.my_shingle.max_shingle_size", 2)
|
||||
.put("index.analysis.filter.my_shingle2.type", "shingle")
|
||||
.put("index.analysis.filter.my_shingle2.output_unigrams", true)
|
||||
.put("index.analysis.filter.my_shingle2.min_shingle_size", 2)
|
||||
.put("index.analysis.filter.my_shingle2.max_shingle_size", 2));
|
||||
|
||||
XContentBuilder mapping = XContentFactory.jsonBuilder()
|
||||
.startObject().startObject("type1")
|
||||
.startObject("properties")
|
||||
.startObject("body").field("type", "text").field("analyzer", "body").endObject()
|
||||
.startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject()
|
||||
.startObject("ngram").field("type", "text").field("analyzer", "ngram").endObject()
|
||||
.endObject()
|
||||
.endObject().endObject();
|
||||
assertAcked(builder.addMapping("type1", mapping));
|
||||
ensureGreen();
|
||||
|
||||
String[] strings = new String[]{
|
||||
"Xorr the God-Jewel",
|
||||
"Grog the God-Crusher",
|
||||
"Xorn",
|
||||
"Walter Newell",
|
||||
"Wanda Maximoff",
|
||||
"Captain America",
|
||||
"American Ace",
|
||||
"Wundarr the Aquarian",
|
||||
"Will o' the Wisp",
|
||||
"Xemnu the Titan"
|
||||
};
|
||||
for (String line : strings) {
|
||||
index("test", "type1", line, "body", line, "bigram", line, "ngram", line);
|
||||
}
|
||||
refresh();
|
||||
|
||||
NumShards numShards = getNumShards("test");
|
||||
|
||||
// Lets make sure some things throw exceptions
|
||||
PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram")
|
||||
.analyzer("body")
|
||||
.addCandidateGenerator(candidateGenerator("does_not_exist").minWordLength(1).suggestMode("always"))
|
||||
.realWordErrorLikelihood(0.95f)
|
||||
.maxErrors(0.5f)
|
||||
.size(1);
|
||||
phraseSuggestion.clearCandidateGenerators().analyzer(null);
|
||||
try {
|
||||
searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion));
|
||||
fail("analyzer does only produce ngrams");
|
||||
} catch (SearchPhaseExecutionException e) {
|
||||
}
|
||||
|
||||
phraseSuggestion.analyzer("bigram");
|
||||
try {
|
||||
searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion));
|
||||
fail("analyzer does only produce ngrams");
|
||||
} catch (SearchPhaseExecutionException e) {
|
||||
}
|
||||
|
||||
// Now we'll make sure some things don't
|
||||
phraseSuggestion.forceUnigrams(false);
|
||||
searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
|
||||
|
||||
// Field doesn't produce unigrams but the analyzer does
|
||||
phraseSuggestion.forceUnigrams(true).analyzer("ngram");
|
||||
searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
|
||||
|
||||
phraseSuggestion = phraseSuggestion("ngram")
|
||||
.analyzer("myDefAnalyzer")
|
||||
.forceUnigrams(true)
|
||||
.realWordErrorLikelihood(0.95f)
|
||||
.maxErrors(0.5f)
|
||||
.size(1)
|
||||
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
|
||||
Suggest suggest = searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
|
||||
|
||||
// "xorr the god jewel" and and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by
|
||||
// earlier term (xorn):
|
||||
assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");
|
||||
|
||||
phraseSuggestion.analyzer(null);
|
||||
suggest = searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
|
||||
|
||||
// In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
|
||||
// probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
|
||||
// others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood
|
||||
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
|
||||
}
|
||||
|
||||
public void testDifferentShardSize() throws Exception {
|
||||
createIndex("test");
|
||||
ensureGreen();
|
||||
|
|
|
@ -21,13 +21,31 @@ package org.elasticsearch.analysis.common;
|
|||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
||||
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
|
||||
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.core.UpperCaseFilter;
|
||||
import org.apache.lucene.analysis.cz.CzechStemFilter;
|
||||
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
|
||||
import org.apache.lucene.analysis.de.GermanStemFilter;
|
||||
import org.apache.lucene.analysis.en.KStemFilter;
|
||||
import org.apache.lucene.analysis.en.PorterStemFilter;
|
||||
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
|
||||
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
|
||||
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
|
||||
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
|
||||
|
@ -35,16 +53,25 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
|
|||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
||||
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
|
||||
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.lucene.analysis.standard.ClassicFilter;
|
||||
import org.apache.lucene.analysis.tr.ApostropheFilter;
|
||||
import org.apache.lucene.analysis.util.ElisionFilter;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
|
||||
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
import org.tartarus.snowball.ext.DutchStemmer;
|
||||
import org.tartarus.snowball.ext.FrenchStemmer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -74,29 +101,61 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
|
|||
@Override
|
||||
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
|
||||
List<PreConfiguredTokenFilter> filters = new ArrayList<>();
|
||||
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, input -> new ASCIIFoldingFilter(input)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
|
||||
input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
|
||||
new DelimitedPayloadTokenFilter(input,
|
||||
DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
|
||||
DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
|
||||
new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
|
||||
// TODO deprecate edgeNGram
|
||||
filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input ->
|
||||
new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("elision", true,
|
||||
input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
|
||||
new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
|
||||
filters.add(PreConfiguredTokenFilter.singleton("limit", false, input ->
|
||||
new LimitTokenCountFilter(input,
|
||||
LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
|
||||
LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
|
||||
// TODO deprecate nGram
|
||||
filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("reverse", false, input -> new ReverseStringFilter(input)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, ShingleFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
|
||||
// The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
|
||||
filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("unique", false, input -> new UniqueTokenFilter(input)));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
|
||||
|
|
|
@ -20,6 +20,8 @@
|
|||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.en.PorterStemFilterFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
|
||||
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
|
||||
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
|
||||
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
|
||||
|
@ -68,22 +70,46 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
    @Override
    protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
        Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenFilters());
        filters.put("apostrophe", null);
        filters.put("arabic_normalization", null);
        filters.put("arabic_stem", null);
        filters.put("asciifolding", null);
        filters.put("brazilian_stem", null);
        filters.put("cjk_bigram", null);
        filters.put("cjk_width", null);
        filters.put("classic", null);
        filters.put("common_grams", null);
        filters.put("czech_stem", null);
        filters.put("decimal_digit", null);
        filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class);
        filters.put("dutch_stem", SnowballPorterFilterFactory.class);
        filters.put("edge_ngram", null);
        filters.put("edgeNGram", null);
        filters.put("elision", null);
        filters.put("french_stem", SnowballPorterFilterFactory.class);
        filters.put("german_stem", null);
        filters.put("hindi_normalization", null);
        filters.put("indic_normalization", null);
        filters.put("keyword_repeat", null);
        filters.put("kstem", null);
        filters.put("length", null);
        filters.put("limit", LimitTokenCountFilterFactory.class);
        filters.put("ngram", null);
        filters.put("nGram", null);
        filters.put("persian_normalization", null);
        filters.put("porter_stem", null);
        filters.put("reverse", ReverseStringFilterFactory.class);
        filters.put("russian_stem", SnowballPorterFilterFactory.class);
        filters.put("scandinavian_normalization", null);
        filters.put("scandinavian_folding", null);
        filters.put("shingle", null);
        filters.put("snowball", SnowballPorterFilterFactory.class);
        filters.put("sorani_normalization", null);
        filters.put("stemmer", PorterStemFilterFactory.class);
        filters.put("stop", null);
        filters.put("trim", null);
        filters.put("truncate", null);
        filters.put("type_as_payload", null);
        filters.put("unique", Void.class);
        filters.put("uppercase", null);
        filters.put("word_delimiter", null);
@ -0,0 +1,65 @@
# integration tests for queries with specific analysis chains

"match query with stacked stems":
  # Tests the match query stemmed tokens are "stacked" on top of the unstemmed
  # versions in the same position.
  - do:
      indices.create:
        index: test
        body:
          settings:
            number_of_shards: 1
            number_of_replicas: 1
            analysis:
              analyzer:
                index:
                  tokenizer: standard
                  filter: [lowercase]
                search:
                  tokenizer: standard
                  filter: [lowercase, keyword_repeat, porter_stem, unique_stem]
              filter:
                unique_stem:
                  type: unique
                  only_on_same_position: true
          mappings:
            doc:
              properties:
                text:
                  type: text
                  analyzer: index
                  search_analyzer: search
  - do:
      index:
        index: test
        type: doc
        id: 1
        body: { "text": "the fox runs across the street" }
        refresh: true

  - do:
      search:
        body:
          query:
            match:
              text:
                query: fox runs
                operator: AND
  - match: {hits.total: 1}

  - do:
      index:
        index: test
        type: doc
        id: 2
        body: { "text": "run fox run" }
        refresh: true

  - do:
      search:
        body:
          query:
            match:
              text:
                query: fox runs
                operator: AND
  - match: {hits.total: 2}
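The "stacking" this test relies on can be reproduced with plain Lucene filters outside Elasticsearch. A minimal sketch, assuming lucene-core and lucene-analyzers-common are on the classpath; the class name `StackedStemsDemo` is made up, and `RemoveDuplicatesTokenFilter` is used here only as a stand-in for the `unique` filter with `only_on_same_position: true`:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class StackedStemsDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("the fox runs across the street"));
        TokenStream stream = new LowerCaseFilter(tokenizer);
        stream = new KeywordRepeatFilter(stream);          // emit each token twice: a keyword copy and a stemmable copy
        stream = new PorterStemFilter(stream);             // stem only the non-keyword copy
        stream = new RemoveDuplicatesTokenFilter(stream);  // drop the copy when the stem equals the original

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int position = 0;
        while (stream.incrementToken()) {
            position += posInc.getPositionIncrement();
            // "runs" and its stem "run" print with the same position, i.e. stacked
            System.out.println(position + " -> " + term);
        }
        stream.end();
        stream.close();
    }
}
```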
@ -0,0 +1,158 @@
# Integration tests for the phrase suggester with a few analyzers

setup:
  - do:
      indices.create:
        index: test
        body:
          settings:
            number_of_shards: 1
            number_of_replicas: 1
            analysis:
              analyzer:
                body:
                  tokenizer: standard
                  filter: [lowercase]
                bigram:
                  tokenizer: standard
                  filter: [lowercase, bigram]
                ngram:
                  tokenizer: standard
                  filter: [lowercase, ngram]
              filter:
                bigram:
                  type: shingle
                  output_unigrams: false
                  min_shingle_size: 2
                  max_shingle_size: 2
                ngram:
                  type: shingle
                  output_unigrams: true
                  min_shingle_size: 2
                  max_shingle_size: 2
          mappings:
            doc:
              properties:
                body:
                  type: text
                  analyzer: body
                  fields:
                    bigram:
                      type: text
                      analyzer: bigram
                    ngram:
                      type: text
                      analyzer: ngram

  - do:
      bulk:
        index: test
        type: doc
        refresh: true
        body: |
          { "index": {} }
          { "body": "Xorr the God-Jewel" }
          { "index": {} }
          { "body": "Xorn" }
---
"sorts by score":
  - do:
      search:
        size: 0
        index: test
        body:
          suggest:
            text: xor the got-jewel
            test:
              phrase:
                field: body.ngram
                force_unigrams: true
                max_errors: 0.5
                direct_generator:
                  - field: body.ngram
                    min_word_length: 1
                    suggest_mode: always

  - match: {suggest.test.0.options.0.text: xorr the god jewel}
  - match: {suggest.test.0.options.1.text: xorn the god jewel}
---
"breaks ties by sorting terms":
  # This runs the suggester without bigrams so we can be sure of the sort order
  - do:
      search:
        size: 0
        index: test
        body:
          suggest:
            text: xor the got-jewel
            test:
              phrase:
                field: body
                analyzer: body
                force_unigrams: true
                max_errors: 0.5
                direct_generator:
                  - field: body
                    min_word_length: 1
                    suggest_mode: always

  # The scores are identical but xorn comes first because it sorts first
  - match: {suggest.test.0.options.0.text: xorn the god jewel}
  - match: {suggest.test.0.options.1.text: xorr the god jewel}
  - match: {suggest.test.0.options.0.score: $body.suggest.test.0.options.1.score}
---
"fails when asked to run on a field without unigrams":
  - do:
      catch: /since it doesn't emit unigrams/
      search:
        size: 0
        index: test
        body:
          suggest:
            text: xor the got-jewel
            test:
              phrase:
                field: body.bigram

  - do:
      catch: /since it doesn't emit unigrams/
      search:
        size: 0
        index: test
        body:
          suggest:
            text: xor the got-jewel
            test:
              phrase:
                field: body.bigram
                analyzer: bigram

---
"doesn't fail when asked to run on a field without unigrams when force_unigrams=false":
  - do:
      search:
        size: 0
        index: test
        body:
          suggest:
            text: xor the got-jewel
            test:
              phrase:
                field: body.bigram
                force_unigrams: false

  - do:
      search:
        size: 0
        index: test
        body:
          suggest:
            text: xor the got-jewel
            test:
              phrase:
                field: body.bigram
                analyzer: bigram
                force_unigrams: false
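The reason `body.bigram` is rejected unless `force_unigrams` is turned off comes down to `output_unigrams` on the two shingle filters defined in the setup. A small Lucene sketch of the same two configurations, assuming lucene-analyzers-common on the classpath; the class name `ShingleConfigDemo` is made up and the chain only approximates the test's analyzers:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleConfigDemo {
    public static void main(String[] args) throws Exception {
        print("bigram (output_unigrams: false)", false); // only two-word shingles, no single terms
        print("ngram  (output_unigrams: true)", true);   // single terms plus two-word shingles
    }

    static void print(String label, boolean outputUnigrams) throws Exception {
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("Xorr the God-Jewel"));
        TokenStream lowercased = new LowerCaseFilter(tokenizer);
        ShingleFilter shingles = new ShingleFilter(lowercased, 2, 2); // min/max shingle size 2
        shingles.setOutputUnigrams(outputUnigrams);

        CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
        shingles.reset();
        StringBuilder tokens = new StringBuilder();
        while (shingles.incrementToken()) {
            tokens.append('[').append(term).append("] ");
        }
        shingles.end();
        shingles.close();
        System.out.println(label + ": " + tokens);
    }
}
```

Without unigrams the field never contains single terms, which is why the phrase suggester refuses to generate candidates from it unless `force_unigrams: false` is set.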
@ -0,0 +1,40 @@
"term vectors with payloads tests":
  # Tests term vectors with payloads. This is in the analysis-common module
  # because there are no token filters that support payloads in core.
  - do:
      indices.create:
        index: test
        body:
          mappings:
            doc:
              properties:
                text:
                  type: text
                  term_vector: with_positions_offsets_payloads
                  analyzer: has_payloads
          settings:
            number_of_shards: 1
            number_of_replicas: 1
            analysis:
              analyzer:
                has_payloads:
                  tokenizer: standard
                  filter: [type_as_payload]

  - do:
      index:
        index: test
        type: doc
        id: 1
        refresh: true
        body:
          text: The quick brown fox is brown.

  - do:
      termvectors:
        index: test
        type: doc
        id: 1
        payloads: true
  - match: {term_vectors.text.field_statistics.sum_doc_freq: 5}
  - match: {term_vectors.text.terms.brown.tokens.0.payload: 10}
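What `type_as_payload` actually stores can be seen with the underlying Lucene filter, which copies each token's type attribute into its payload; that payload is what the termvectors request above asks for with `payloads: true`. A minimal sketch, assuming lucene-core and lucene-analyzers-common on the classpath; the class name `TypeAsPayloadDemo` is made up:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

public class TypeAsPayloadDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("The quick brown fox is brown."));
        TokenStream stream = new TypeAsPayloadTokenFilter(tokenizer);

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            BytesRef bytes = payload.getPayload();
            // The payload is the token type as UTF-8, e.g. "<ALPHANUM>" for the standard tokenizer.
            System.out.println(term + " -> " + (bytes == null ? "null" : bytes.utf8ToString()));
        }
        stream.end();
        stream.close();
    }
}
```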
@ -19,8 +19,6 @@

package org.elasticsearch.indices.analysis;

import org.apache.lucene.analysis.en.PorterStemFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
@ -97,7 +95,6 @@ import java.util.Collection;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
@ -343,29 +340,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
        Map<String, Class<?>> filters = new HashMap<>();
        filters.put("standard", null);
        filters.put("lowercase", null);
        // TODO remove the loop below once all the tokenizers are migrated out of PreBuiltTokenFilters
        for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) {
            Class<?> luceneFactoryClass;
            switch (tokenizer) {
            case LOWERCASE:
                // This has been migrated but has to stick around until PreBuiltTokenizers is removed.
                continue;
            case DUTCH_STEM:
            case FRENCH_STEM:
            case RUSSIAN_STEM:
                luceneFactoryClass = SnowballPorterFilterFactory.class;
                break;
            case DELIMITED_PAYLOAD_FILTER:
                luceneFactoryClass = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class;
                break;
            case LIMIT:
                luceneFactoryClass = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class;
                break;
            default:
                luceneFactoryClass = null;
            }
            filters.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClass);
        }
        return filters;
    }