diff --git a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index 73a47f5f19f..19024adc61c 100644 --- a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -481,6 +481,7 @@ public class AnalysisModule extends AbstractModule { tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class); diff --git a/src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java new file mode 100644 index 00000000000..0318a2eedea --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java @@ -0,0 +1,75 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.payloads.*;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ * Factory for the {@code delimited_payload_filter} token filter.
+ * Splits each token at a configurable delimiter character and stores the
+ * suffix as the token's payload, encoded as {@code float} (default),
+ * {@code int}, or {@code identity} (raw bytes).
+ */
+public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    static final char DEFAULT_DELIMITER = '|';
+    static final PayloadEncoder DEFAULT_ENCODER = new FloatEncoder();
+
+    static final String ENCODING = "encoding";
+    static final String DELIMITER = "delimiter";
+
+    char delimiter;
+    PayloadEncoder encoder;
+
+    @Inject
+    public DelimitedPayloadTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name,
+                                              @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        // only the first character of the configured delimiter string is used
+        String delimiterConf = settings.get(DELIMITER);
+        if (delimiterConf != null) {
+            delimiter = delimiterConf.charAt(0);
+        } else {
+            delimiter = DEFAULT_DELIMITER;
+        }
+
+        String encodingConf = settings.get(ENCODING);
+        if (encodingConf == null) {
+            encoder = DEFAULT_ENCODER;
+        } else if ("float".equals(encodingConf)) {
+            encoder = new FloatEncoder();
+        } else if ("int".equals(encodingConf)) {
+            encoder = new IntegerEncoder();
+        } else if ("identity".equals(encodingConf)) {
+            encoder = new IdentityEncoder();
+        } else {
+            // fail fast on an unsupported encoding instead of leaving encoder
+            // null, which previously caused an NPE when the filter first ran
+            throw new ElasticSearchIllegalArgumentException("unsupported encoding [" + encodingConf
+                    + "] for [" + name + "], supported encodings are [float], [int] and [identity]");
+        }
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new DelimitedPayloadTokenFilter(tokenStream, delimiter, encoder);
+    }
+}
diff --git a/src/test/java/org/elasticsearch/termvectors/GetTermVectorTests.java b/src/test/java/org/elasticsearch/termvectors/GetTermVectorTests.java
index 125c8f9af28..59919c8ffe4 100644
--- a/src/test/java/org/elasticsearch/termvectors/GetTermVectorTests.java
+++ b/src/test/java/org/elasticsearch/termvectors/GetTermVectorTests.java
@@ -19,6 +19,8 @@
 package org.elasticsearch.termvectors;
 
+import com.carrotsearch.hppc.ObjectIntOpenHashMap;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.*;
 import org.apache.lucene.util.BytesRef;
@@ -36,6 +38,10 @@ import org.hamcrest.Matchers;
 import org.junit.Test;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
 import static org.hamcrest.Matchers.equalTo;
@@ -355,8 +361,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
         createIndexBasedOnFieldSettings(testFieldSettings, -1);
         TestDoc[] testDocs = generateTestDocs(5, testFieldSettings);
 
-//        for (int i=0;i<testDocs.length;i++)
-//            System.out.println(testDocs[i]);
 
@@ -365,4 +369,170 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
+    @Test
+    public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws ElasticSearchException, IOException {
+        // NOTE(review): the opening lines of this method were corrupted in the
+        // transmitted patch; the encoding/token setup below is reconstructed from
+        // the visible call sites — confirm against the original commit.
+        int encoding = randomIntBetween(0, 2);
+        String encodingString = "";
+        if (encoding == 0) {
+            encodingString = "float";
+        }
+        if (encoding == 1) {
+            encodingString = "int";
+        }
+        if (encoding == 2) {
+            encodingString = "identity";
+        }
+        String[] tokens = crateRandomTokens();
+        Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
+        String delimiter = createRandomDelimiter(tokens);
+        String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
+        //create the mapping
+        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties")
+                .startObject("field").field("type", "string").field("term_vector", "with_positions_offsets_payloads")
+                .field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
+        ElasticsearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(
+                ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
+                        .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
+                        .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
+                        .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
+                        .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));
+        ensureYellow();
+
+        client().prepareIndex("test", "type1", Integer.toString(1))
+                .setSource(XContentFactory.jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
+        refresh();
+        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true)
+                .setPositions(true).setSelectedFields();
+        TermVectorResponse response = resp.execute().actionGet();
+        assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true));
+        Fields fields = response.getFields();
+        assertThat(fields.size(), equalTo(1));
+        Terms terms = fields.terms("field");
+        TermsEnum iterator = terms.iterator(null);
+        while (iterator.next() != null) {
+            String term = iterator.term().utf8ToString();
+            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
+            // check for null BEFORE dereferencing (the original asserted only
+            // after the first use, and bare 'assert' is a no-op without -ea)
+            assertThat(docsAndPositions, Matchers.notNullValue());
+            assertThat(docsAndPositions.nextDoc(), equalTo(0));
+            List<BytesRef> curPayloads = payloads.get(term);
+            assertThat(term, curPayloads, Matchers.notNullValue());
+            for (int k = 0; k < docsAndPositions.freq(); k++) {
+                docsAndPositions.nextPosition();
+                if (docsAndPositions.getPayload() != null) {
+                    String infoString = "\nterm: " + term + " has payload \n" + docsAndPositions.getPayload().toString()
+                            + "\n but should have payload \n" + curPayloads.get(k).toString();
+                    assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
+                } else {
+                    String infoString = "\nterm: " + term + " has no payload but should have payload \n" + curPayloads.get(k).toString();
+                    assertThat(infoString, curPayloads.get(k).length, equalTo(0));
+                }
+            }
+        }
+        assertThat(iterator.next(), Matchers.nullValue());
+    }
+
+    // Pick a random single-character delimiter that appears in no token and is
+    // not whitespace, so the whitespace tokenizer and payload split stay unambiguous.
+    private String createRandomDelimiter(String[] tokens) {
+        String delimiter = "";
+        boolean isTokenOrWhitespace = true;
+        while (isTokenOrWhitespace) {
+            isTokenOrWhitespace = false;
+            delimiter = randomUnicodeOfLength(1);
+            for (String token : tokens) {
+                if (token.contains(delimiter)) {
+                    isTokenOrWhitespace = true;
+                }
+            }
+            if (Character.isWhitespace(delimiter.charAt(0))) {
+                isTokenOrWhitespace = true;
+            }
+        }
+        return delimiter;
+    }
+
+    // Render "token<delimiter><payload> " for each token, using the k-th payload
+    // for the k-th occurrence of a token; empty payloads emit the bare token.
+    private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
+        String resultString = "";
+        ObjectIntOpenHashMap<String> payloadCounter = new ObjectIntOpenHashMap<String>();
+        for (String token : tokens) {
+            if (!payloadCounter.containsKey(token)) {
+                payloadCounter.putIfAbsent(token, 0);
+            } else {
+                payloadCounter.put(token, payloadCounter.get(token) + 1);
+            }
+            resultString = resultString + token;
+            BytesRef payload = payloads.get(token).get(payloadCounter.get(token));
+            if (payload.length > 0) {
+                resultString = resultString + delimiter;
+                switch (encoding) {
+                    case 0: {
+                        resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset));
+                        break;
+                    }
+                    case 1: {
+                        resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset));
+                        break;
+                    }
+                    case 2: {
+                        resultString = resultString + payload.utf8ToString();
+                        break;
+                    }
+                    default: {
+                        throw new ElasticSearchException("unsupported encoding type");
+                    }
+                }
+            }
+            resultString = resultString + " ";
+        }
+        return resultString;
+    }
+
+    // For each token occurrence, randomly either generate an encoded payload
+    // (float/int/identity per 'encoding') or an empty BytesRef (no payload).
+    private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) {
+        Map<String, List<BytesRef>> payloads = new HashMap<String, List<BytesRef>>();
+        for (String token : tokens) {
+            if (payloads.get(token) == null) {
+                payloads.put(token, new ArrayList<BytesRef>());
+            }
+            boolean createPayload = randomBoolean();
+            if (createPayload) {
+                switch (encoding) {
+                    case 0: {
+                        float theFloat = randomFloat();
+                        payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat)));
+                        break;
+                    }
+                    case 1: {
+                        payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt())));
+                        break;
+                    }
+                    case 2: {
+                        // identity payloads must not contain whitespace or the
+                        // whitespace tokenizer would split them; replace with 'w'
+                        String payload = randomUnicodeOfLengthBetween(50, 100);
+                        for (int c = 0; c < payload.length(); c++) {
+                            if (Character.isWhitespace(payload.charAt(c))) {
+                                payload = payload.replace(payload.charAt(c), 'w');
+                            }
+                        }
+                        payloads.get(token).add(new BytesRef(payload));
+                        break;
+                    }
+                    default: {
+                        throw new ElasticSearchException("unsupported encoding type");
+                    }
+                }
+            } else {
+                payloads.get(token).add(new BytesRef());
+            }
+        }
+        return payloads;
+    }
+
+    // NOTE(review): name is typo'd ("crate") in the original patch; kept as-is
+    // because the test method above calls it by this name.
+    private String[] crateRandomTokens() {
+        String[] tokens = { "the", "quick", "brown", "fox" };
+        int numTokensWithDuplicates = randomIntBetween(3, 15);
+        String[] finalTokens = new String[numTokensWithDuplicates];
+        for (int i = 0; i < numTokensWithDuplicates; i++) {
+            finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)];
+        }
+        return finalTokens;
+    }
 }