parent ce0ab79155
commit 4e7a1788ea
@@ -481,6 +481,7 @@ public class AnalysisModule extends AbstractModule {
         tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
+        tokenFiltersBindings.processTokenFilter("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);
@@ -0,0 +1,75 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.payloads.*;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ *
+ */
+public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    static final char DEFAULT_DELIMITER = '|';
+    static final PayloadEncoder DEFAULT_ENCODER = new FloatEncoder();
+
+    static final String ENCODING = "encoding";
+    static final String DELIMITER = "delimiter";
+
+    char delimiter;
+    PayloadEncoder encoder;
+
+    @Inject
+    public DelimitedPayloadTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name,
+                                              @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        String delimiterConf = settings.get(DELIMITER);
+        if (delimiterConf != null) {
+            delimiter = delimiterConf.charAt(0);
+        } else {
+            delimiter = DEFAULT_DELIMITER;
+        }
+
+        if (settings.get(ENCODING) != null) {
+            if (settings.get(ENCODING).equals("float")) {
+                encoder = new FloatEncoder();
+            } else if (settings.get(ENCODING).equals("int")) {
+                encoder = new IntegerEncoder();
+            } else if (settings.get(ENCODING).equals("identity")) {
+                encoder = new IdentityEncoder();
+            }
+        } else {
+            encoder = DEFAULT_ENCODER;
+        }
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(tokenStream, delimiter, encoder);
+        return filter;
+    }
+
+}
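Note: the factory above is a thin wrapper around Lucene's DelimitedPayloadTokenFilter. A minimal standalone sketch of what it wires up, runnable outside Elasticsearch (this assumes a Lucene 4.x classpath; Version.LUCENE_43 and the class name DelimitedPayloadDemo are illustrative assumptions, not part of the commit):

    // Feed "term|payload" tokens through Lucene's DelimitedPayloadTokenFilter,
    // the filter that DelimitedPayloadTokenFilterFactory.create() returns.
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
    import org.apache.lucene.analysis.payloads.FloatEncoder;
    import org.apache.lucene.analysis.payloads.PayloadHelper;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.Version;

    public class DelimitedPayloadDemo {
        public static void main(String[] args) throws Exception {
            // '|' and FloatEncoder mirror the factory's defaults
            TokenStream ts = new DelimitedPayloadTokenFilter(
                    new WhitespaceTokenizer(Version.LUCENE_43, new StringReader("the|0.5 quick|2.0 fox")),
                    '|', new FloatEncoder());
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            PayloadAttribute payload = ts.addAttribute(PayloadAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                BytesRef p = payload.getPayload();
                // "fox" carries no delimiter, so its payload is null
                System.out.println(term + " -> "
                        + (p == null ? "no payload" : PayloadHelper.decodeFloat(p.bytes, p.offset)));
            }
            ts.end();
            ts.close();
        }
    }

One caveat visible in the factory: an `encoding` value other than float, int, or identity leaves `encoder` null, so the filter returned by create() appears to fail with a NullPointerException once tokens flow through it.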
@@ -19,6 +19,8 @@
 package org.elasticsearch.termvectors;

+import com.carrotsearch.hppc.ObjectIntOpenHashMap;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.*;
 import org.apache.lucene.util.BytesRef;
@@ -36,6 +38,10 @@ import org.hamcrest.Matchers;
 import org.junit.Test;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;

 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
 import static org.hamcrest.Matchers.equalTo;
@@ -355,8 +361,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
         createIndexBasedOnFieldSettings(testFieldSettings, -1);
         TestDoc[] testDocs = generateTestDocs(5, testFieldSettings);

-        // for (int i=0;i<testDocs.length;i++)
-        //     logger.info("Doc: {}",testDocs[i]);
         DirectoryReader directoryReader = indexDocsWithLucene(testDocs);
         TestConfig[] testConfigs = generateTestConfigs(20, testDocs, testFieldSettings);
@@ -376,4 +380,168 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
             }
         }
     }
+
+    @Test
+    public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws ElasticSearchException, IOException {
+
+        // create the test document
+        int encoding = randomIntBetween(0, 2);
+        String encodingString = "";
+        if (encoding == 0) {
+            encodingString = "float";
+        }
+        if (encoding == 1) {
+            encodingString = "int";
+        }
+        if (encoding == 2) {
+            encodingString = "identity";
+        }
+        String[] tokens = createRandomTokens();
+        Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
+        String delimiter = createRandomDelimiter(tokens);
+        String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
+        // create the mapping
+        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties")
+                .startObject("field").field("type", "string").field("term_vector", "with_positions_offsets_payloads")
+                .field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
+        ElasticsearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(
+                ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
+                        .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
+                        .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
+                        .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
+                        .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));
+        ensureYellow();
+
+        client().prepareIndex("test", "type1", Integer.toString(1))
+                .setSource(XContentFactory.jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
+        refresh();
+        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true)
+                .setPositions(true).setSelectedFields();
+        TermVectorResponse response = resp.execute().actionGet();
+        assertThat("doc id 1 doesn't exist but should", response.isExists(), equalTo(true));
+        Fields fields = response.getFields();
+        assertThat(fields.size(), equalTo(1));
+        Terms terms = fields.terms("field");
+        TermsEnum iterator = terms.iterator(null);
+        while (iterator.next() != null) {
+            String term = iterator.term().utf8ToString();
+            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
+            assertThat(docsAndPositions.nextDoc(), equalTo(0));
+            List<BytesRef> curPayloads = payloads.get(term);
+            assertThat(term, curPayloads, Matchers.notNullValue());
+            assert docsAndPositions != null;
+            for (int k = 0; k < docsAndPositions.freq(); k++) {
+                docsAndPositions.nextPosition();
+                if (docsAndPositions.getPayload() != null) {
+                    String infoString = "\nterm: " + term + " has payload \n" + docsAndPositions.getPayload().toString()
+                            + "\n but should have payload \n" + curPayloads.get(k).toString();
+                    assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
+                } else {
+                    String infoString = "\nterm: " + term + " has no payload but should have payload \n" + curPayloads.get(k).toString();
+                    assertThat(infoString, curPayloads.get(k).length, equalTo(0));
+                }
+            }
+        }
+        assertThat(iterator.next(), Matchers.nullValue());
+    }
+
+    private String createRandomDelimiter(String[] tokens) {
+        String delimiter = "";
+        boolean isTokenOrWhitespace = true;
+        while (isTokenOrWhitespace) {
+            isTokenOrWhitespace = false;
+            delimiter = randomUnicodeOfLength(1);
+            for (String token : tokens) {
+                if (token.contains(delimiter)) {
+                    isTokenOrWhitespace = true;
+                }
+            }
+            if (Character.isWhitespace(delimiter.charAt(0))) {
+                isTokenOrWhitespace = true;
+            }
+        }
+        return delimiter;
+    }
+
+    private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
+        String resultString = "";
+        ObjectIntOpenHashMap<String> payloadCounter = new ObjectIntOpenHashMap<String>();
+        for (String token : tokens) {
+            if (!payloadCounter.containsKey(token)) {
+                payloadCounter.putIfAbsent(token, 0);
+            } else {
+                payloadCounter.put(token, payloadCounter.get(token) + 1);
+            }
+            resultString = resultString + token;
+            BytesRef payload = payloads.get(token).get(payloadCounter.get(token));
+            if (payload.length > 0) {
+                resultString = resultString + delimiter;
+                switch (encoding) {
+                    case 0: {
+                        resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset));
+                        break;
+                    }
+                    case 1: {
+                        resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset));
+                        break;
+                    }
+                    case 2: {
+                        resultString = resultString + payload.utf8ToString();
+                        break;
+                    }
+                    default: {
+                        throw new ElasticSearchException("unsupported encoding type");
+                    }
+                }
+            }
+            resultString = resultString + " ";
+        }
+        return resultString;
+    }
+
+    private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) {
+        Map<String, List<BytesRef>> payloads = new HashMap<String, List<BytesRef>>();
+        for (String token : tokens) {
+            if (payloads.get(token) == null) {
+                payloads.put(token, new ArrayList<BytesRef>());
+            }
+            boolean createPayload = randomBoolean();
+            if (createPayload) {
+                switch (encoding) {
+                    case 0: {
+                        float theFloat = randomFloat();
+                        payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat)));
+                        break;
+                    }
+                    case 1: {
+                        payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt())));
+                        break;
+                    }
+                    case 2: {
+                        String payload = randomUnicodeOfLengthBetween(50, 100);
+                        for (int c = 0; c < payload.length(); c++) {
+                            if (Character.isWhitespace(payload.charAt(c))) {
+                                payload = payload.replace(payload.charAt(c), 'w');
+                            }
+                        }
+                        payloads.get(token).add(new BytesRef(payload));
+                        break;
+                    }
+                    default: {
+                        throw new ElasticSearchException("unsupported encoding type");
+                    }
+                }
+            } else {
+                payloads.get(token).add(new BytesRef());
+            }
+        }
+        return payloads;
+    }
+
+    private String[] createRandomTokens() {
+        String[] tokens = { "the", "quick", "brown", "fox" };
+        int numTokensWithDuplicates = randomIntBetween(3, 15);
+        String[] finalTokens = new String[numTokensWithDuplicates];
+        for (int i = 0; i < numTokensWithDuplicates; i++) {
+            finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)];
+        }
+        return finalTokens;
+    }
 }
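For reference, the test drives the new filter purely through index settings; condensed, the registration looks like this (the index name "payloads" is illustrative, while the analyzer and filter names are the test's own; this sketch assumes the same test harness, where client() and ImmutableSettings are in scope):

    // Register the new "delimited_payload_filter" type under a custom analyzer,
    // mirroring the settings built in testRandomPayloadWithDelimitedPayloadTokenFilter.
    client().admin().indices().prepareCreate("payloads")
            .setSettings(ImmutableSettings.settingsBuilder()
                    .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
                    .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")
                    .put("index.analysis.filter.my_delimited_payload_filter.delimiter", "|")
                    .put("index.analysis.filter.my_delimited_payload_filter.encoding", "float"))
            .execute().actionGet();

A field analyzed with payload_test then accepts values such as "the|0.5 quick|2.0 fox", which is exactly the shape createString(...) generates in the test above.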