enable delimited payload token filter

closes #3859
Britta Weber 2013-10-08 18:04:33 +02:00
parent ce0ab79155
commit 4e7a1788ea
3 changed files with 246 additions and 2 deletions
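For context: the commit registers the filter under the name "delimited_payload_filter" (see the AnalysisModule change below), so it can be wired into a custom analyzer via index settings. A minimal sketch of such a configuration, modeled on the integration test further down; the index, analyzer, and filter names here are illustrative, and '|' / "float" mirror the factory's defaults:

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class DelimitedPayloadSettingsSketch {
    public static void main(String[] args) {
        // Illustrative names; '|' and "float" are the defaults and could be omitted.
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
                .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
                .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")
                .put("index.analysis.filter.my_delimited_payload_filter.delimiter", "|")
                .put("index.analysis.filter.my_delimited_payload_filter.encoding", "float")
                .build();
        System.out.println(settings.getAsMap());
    }
}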

src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java

@@ -481,6 +481,7 @@ public class AnalysisModule extends AbstractModule {
tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class); tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);

src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java

@@ -0,0 +1,75 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.payloads.*;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

/**
*
*/
public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFactory {
static final char DEFAULT_DELIMITER = '|';
static final PayloadEncoder DEFAULT_ENCODER = new FloatEncoder();
static final String ENCODING = "encoding";
static final String DELIMITER = "delimiter";
char delimiter;
PayloadEncoder encoder;
@Inject
public DelimitedPayloadTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name,
@Assisted Settings settings) {
super(index, indexSettings, name, settings);
String delimiterConf = settings.get(DELIMITER);
if (delimiterConf != null) {
delimiter = delimiterConf.charAt(0);
} else {
delimiter = DEFAULT_DELIMITER;
}
String encodingConf = settings.get(ENCODING);
if (encodingConf != null) {
if (encodingConf.equals("float")) {
encoder = new FloatEncoder();
} else if (encodingConf.equals("int")) {
encoder = new IntegerEncoder();
} else if (encodingConf.equals("identity")) {
encoder = new IdentityEncoder();
} else {
// unrecognized value: fall back to the default instead of leaving the encoder null
encoder = DEFAULT_ENCODER;
}
} else {
encoder = DEFAULT_ENCODER;
}
}
@Override
public TokenStream create(TokenStream tokenStream) {
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(tokenStream, delimiter, encoder);
return filter;
}
}
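The factory wraps Lucene's DelimitedPayloadTokenFilter: given input like "the|0.1 quick|0.3", each token is truncated at the delimiter and the suffix is stored as a payload via the configured encoder. A standalone sketch of that behavior, assuming Lucene 4.x (Version.LUCENE_44 is an assumption about the targeted version, not stated in this commit):

import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.Version;

public class DelimitedPayloadSketch {
    public static void main(String[] args) throws Exception {
        // Tokenize on whitespace, then split each token at '|' and encode
        // the suffix as a 4-byte float payload (the factory's defaults).
        WhitespaceTokenizer tokens = new WhitespaceTokenizer(
                Version.LUCENE_44, new StringReader("the|0.1 quick|0.3 fox|0.7"));
        DelimitedPayloadTokenFilter filter =
                new DelimitedPayloadTokenFilter(tokens, '|', new FloatEncoder());
        CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
        PayloadAttribute payload = filter.addAttribute(PayloadAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            // prints e.g. "the -> [3d cc cc cd]" (0.1f, big-endian)
            System.out.println(term + " -> " + payload.getPayload());
        }
        filter.end();
        filter.close();
    }
}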

src/test/java/org/elasticsearch/termvectors/GetTermVectorTests.java

@@ -19,6 +19,8 @@
package org.elasticsearch.termvectors;
import com.carrotsearch.hppc.ObjectIntOpenHashMap;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
@@ -36,6 +38,10 @@ import org.hamcrest.Matchers;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
import static org.hamcrest.Matchers.equalTo;
@@ -355,8 +361,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
createIndexBasedOnFieldSettings(testFieldSettings, -1);
TestDoc[] testDocs = generateTestDocs(5, testFieldSettings);
// for (int i=0;i<testDocs.length;i++)
// logger.info("Doc: {}",testDocs[i]);
DirectoryReader directoryReader = indexDocsWithLucene(testDocs);
TestConfig[] testConfigs = generateTestConfigs(20, testDocs, testFieldSettings);
@@ -376,4 +380,168 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
}
}
}
@Test
public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws ElasticSearchException, IOException {
//create the test document
int encoding = randomIntBetween(0, 2);
String encodingString = "";
if (encoding == 0) {
encodingString = "float";
}
if (encoding == 1) {
encodingString = "int";
}
if (encoding == 2) {
encodingString = "identity";
}
String[] tokens = createRandomTokens();
Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
String delimiter = createRandomDelimiter(tokens);
String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
//create the mapping
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("field").field("type", "string").field("term_vector", "with_positions_offsets_payloads")
.field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
ElasticsearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(
ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
.put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
.put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
.put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));
ensureYellow();
client().prepareIndex("test", "type1", Integer.toString(1))
.setSource(XContentFactory.jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
refresh();
TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true)
.setPositions(true).setSelectedFields();
TermVectorResponse response = resp.execute().actionGet();
assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true));
Fields fields = response.getFields();
assertThat(fields.size(), equalTo(1));
Terms terms = fields.terms("field");
TermsEnum iterator = terms.iterator(null);
while (iterator.next() != null) {
String term = iterator.term().utf8ToString();
DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
assert docsAndPositions != null;
assertThat(docsAndPositions.nextDoc(), equalTo(0));
List<BytesRef> curPayloads = payloads.get(term);
assertThat(term, curPayloads, Matchers.notNullValue());
for (int k = 0; k < docsAndPositions.freq(); k++) {
docsAndPositions.nextPosition();
if (docsAndPositions.getPayload() != null) {
String infoString = "\nterm: " + term + " has payload \n"+ docsAndPositions.getPayload().toString() + "\n but should have payload \n"+curPayloads.get(k).toString();
assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
} else {
String infoString = "\nterm: " + term + " has no payload but should have payload \n"+curPayloads.get(k).toString();
assertThat(infoString, curPayloads.get(k).length, equalTo(0));
}
}
}
assertThat(iterator.next(), Matchers.nullValue());
}
private String createRandomDelimiter(String[] tokens) {
String delimiter = "";
boolean isTokenOrWhitespace = true;
while (isTokenOrWhitespace) {
isTokenOrWhitespace = false;
delimiter = randomUnicodeOfLength(1);
for (String token : tokens) {
if (token.contains(delimiter)) {
isTokenOrWhitespace = true;
}
}
if (Character.isWhitespace(delimiter.charAt(0))) {
isTokenOrWhitespace = true;
}
}
return delimiter;
}
private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
String resultString = "";
ObjectIntOpenHashMap<String> payloadCounter = new ObjectIntOpenHashMap<String>();
for (String token : tokens) {
if (!payloadCounter.containsKey(token)) {
payloadCounter.putIfAbsent(token, 0);
} else {
payloadCounter.put(token, payloadCounter.get(token) + 1);
}
resultString = resultString + token;
BytesRef payload = payloads.get(token).get(payloadCounter.get(token));
if (payload.length > 0) {
resultString = resultString + delimiter;
switch (encoding) {
case 0: {
resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset));
break;
}
case 1: {
resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset));
break;
}
case 2: {
resultString = resultString + payload.utf8ToString();
break;
}
default: {
throw new ElasticSearchException("unsupported encoding type");
}
}
}
resultString = resultString + " ";
}
return resultString;
}
private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) {
Map<String, List<BytesRef>> payloads = new HashMap<String, List<BytesRef>>();
for (String token : tokens) {
if (payloads.get(token) == null) {
payloads.put(token, new ArrayList<BytesRef>());
}
boolean createPayload = randomBoolean();
if (createPayload) {
switch (encoding) {
case 0: {
float theFloat = randomFloat();
payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat)));
break;
}
case 1: {
payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt())));
break;
}
case 2: {
String payload = randomUnicodeOfLengthBetween(50, 100);
for (int c = 0; c < payload.length(); c++) {
if (Character.isWhitespace(payload.charAt(c))) {
payload = payload.replace(payload.charAt(c), 'w');
}
}
payloads.get(token).add(new BytesRef(payload));
break;
}
default: {
throw new ElasticSearchException("unsupported encoding type");
}
}
} else {
payloads.get(token).add(new BytesRef());
}
}
return payloads;
}
private String[] createRandomTokens() {
String[] tokens = { "the", "quick", "brown", "fox" };
int numTokensWithDuplicates = randomIntBetween(3, 15);
String[] finalTokens = new String[numTokensWithDuplicates];
for (int i = 0; i < numTokensWithDuplicates; i++) {
finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)];
}
return finalTokens;
}
}
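The expected payloads in this test are built and decoded with Lucene's PayloadHelper, which defines the byte layout behind the "float" and "int" encodings; "identity" simply stores the raw UTF-8 bytes. A quick round-trip sketch of that helper (same Lucene 4.x assumption as above):

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.util.BytesRef;

public class PayloadHelperDemo {
    public static void main(String[] args) {
        // "float" encoding: 4 bytes, big-endian IEEE-754
        byte[] f = PayloadHelper.encodeFloat(0.25f);
        System.out.println(PayloadHelper.decodeFloat(f, 0)); // 0.25
        // "int" encoding: 4 bytes, big-endian
        byte[] i = PayloadHelper.encodeInt(42);
        System.out.println(PayloadHelper.decodeInt(i, 0)); // 42
        // "identity" encoding: the raw UTF-8 bytes of the payload text
        System.out.println(new BytesRef("abc").utf8ToString()); // abc
    }
}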