parent ce0ab79155
commit 4e7a1788ea

@@ -481,6 +481,7 @@ public class AnalysisModule extends AbstractModule {
        tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);
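For context, a minimal sketch (not part of this commit) of how the newly registered name is consumed: a custom analyzer references the filter through index settings. The analyzer and filter names here are illustrative only; the builder calls mirror those used in the test further down.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class PayloadFilterSettingsSketch {
    // Builds index settings that wire the new "delimited_payload_filter" into a
    // custom analyzer; "payload_analyzer" and "my_payload_filter" are made-up names.
    public static Settings payloadAnalyzerSettings() {
        return ImmutableSettings.settingsBuilder()
                .put("index.analysis.analyzer.payload_analyzer.tokenizer", "whitespace")
                .putArray("index.analysis.analyzer.payload_analyzer.filter", "my_payload_filter")
                .put("index.analysis.filter.my_payload_filter.type", "delimited_payload_filter")
                .put("index.analysis.filter.my_payload_filter.delimiter", "|")
                .put("index.analysis.filter.my_payload_filter.encoding", "float")
                .build();
    }
}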
@@ -0,0 +1,75 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.payloads.*;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

/**
 * Factory for Lucene's {@link DelimitedPayloadTokenFilter}, which splits each
 * token at a delimiter character and stores the trailing part as the token's
 * payload, e.g. "quick|0.5" becomes the term "quick" with payload 0.5.
 */
public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFactory {

    static final char DEFAULT_DELIMITER = '|';
    static final PayloadEncoder DEFAULT_ENCODER = new FloatEncoder();

    static final String ENCODING = "encoding";
    static final String DELIMITER = "delimiter";

    char delimiter;
    PayloadEncoder encoder;

    @Inject
    public DelimitedPayloadTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name,
                                              @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        // only the first character of the configured delimiter is used
        String delimiterConf = settings.get(DELIMITER);
        if (delimiterConf != null) {
            delimiter = delimiterConf.charAt(0);
        } else {
            delimiter = DEFAULT_DELIMITER;
        }

        if (settings.get(ENCODING) != null) {
            if (settings.get(ENCODING).equals("float")) {
                encoder = new FloatEncoder();
            } else if (settings.get(ENCODING).equals("int")) {
                encoder = new IntegerEncoder();
            } else if (settings.get(ENCODING).equals("identity")) {
                encoder = new IdentityEncoder();
            } else {
                // fall back to the default (float) encoder for unrecognized values
                // instead of leaving encoder null, which would otherwise cause a
                // NullPointerException when the filter is first used
                encoder = DEFAULT_ENCODER;
            }
        } else {
            encoder = DEFAULT_ENCODER;
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new DelimitedPayloadTokenFilter(tokenStream, delimiter, encoder);
    }

}
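A minimal standalone sketch (not part of this commit, assuming Lucene 4.x APIs such as Version.LUCENE_44) of what the factory's create(...) wires up: with the default '|' delimiter and float encoder, the input "quick|0.5" is indexed as the term "quick" carrying a 4-byte float payload.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.Version;

public class DelimitedPayloadDemo {
    public static void main(String[] args) throws Exception {
        // the same filter chain the factory produces, with default delimiter and encoder
        TokenStream ts = new DelimitedPayloadTokenFilter(
                new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("the|0.1 quick|0.5")),
                '|', new FloatEncoder());
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payload = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // prints "the -> 0.1" and "quick -> 0.5"
            System.out.println(term + " -> "
                    + PayloadHelper.decodeFloat(payload.getPayload().bytes, payload.getPayload().offset));
        }
        ts.close();
    }
}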
@@ -19,6 +19,8 @@

package org.elasticsearch.termvectors;

import com.carrotsearch.hppc.ObjectIntOpenHashMap;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;

@@ -36,6 +38,10 @@ import org.hamcrest.Matchers;
import org.junit.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
import static org.hamcrest.Matchers.equalTo;

@@ -355,8 +361,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
        createIndexBasedOnFieldSettings(testFieldSettings, -1);
        TestDoc[] testDocs = generateTestDocs(5, testFieldSettings);

        // for (int i=0;i<testDocs.length;i++)
        //     logger.info("Doc: {}",testDocs[i]);
        DirectoryReader directoryReader = indexDocsWithLucene(testDocs);
        TestConfig[] testConfigs = generateTestConfigs(20, testDocs, testFieldSettings);

@@ -376,4 +380,168 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
            }
        }
    }

    @Test
    public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws ElasticSearchException, IOException {
        // create the test document
        int encoding = randomIntBetween(0, 2);
        String encodingString = "";
        if (encoding == 0) {
            encodingString = "float";
        }
        if (encoding == 1) {
            encodingString = "int";
        }
        if (encoding == 2) {
            encodingString = "identity";
        }
        String[] tokens = createRandomTokens();
        Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
        String delimiter = createRandomDelimiter(tokens);
        String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
        // create the mapping
        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties")
                .startObject("field").field("type", "string").field("term_vector", "with_positions_offsets_payloads")
                .field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
        ElasticsearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(
                ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
                        .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
                        .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
                        .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
                        .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));
        ensureYellow();

        client().prepareIndex("test", "type1", Integer.toString(1))
                .setSource(XContentFactory.jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
        refresh();
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true)
                .setPositions(true).setSelectedFields();
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id 1 doesn't exist but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        TermsEnum iterator = terms.iterator(null);
        while (iterator.next() != null) {
            String term = iterator.term().utf8ToString();
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assert docsAndPositions != null;
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            List<BytesRef> curPayloads = payloads.get(term);
            assertThat(term, curPayloads, Matchers.notNullValue());
            for (int k = 0; k < docsAndPositions.freq(); k++) {
                docsAndPositions.nextPosition();
                if (docsAndPositions.getPayload() != null) {
                    String infoString = "\nterm: " + term + " has payload \n" + docsAndPositions.getPayload().toString()
                            + "\n but should have payload \n" + curPayloads.get(k).toString();
                    assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
                } else {
                    String infoString = "\nterm: " + term + " has no payload but should have payload \n" + curPayloads.get(k).toString();
                    assertThat(infoString, curPayloads.get(k).length, equalTo(0));
                }
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
    }

    private String createRandomDelimiter(String[] tokens) {
        // pick a single random character that occurs in no token and is not whitespace
        String delimiter = "";
        boolean isTokenOrWhitespace = true;
        while (isTokenOrWhitespace) {
            isTokenOrWhitespace = false;
            delimiter = randomUnicodeOfLength(1);
            for (String token : tokens) {
                if (token.contains(delimiter)) {
                    isTokenOrWhitespace = true;
                }
            }
            if (Character.isWhitespace(delimiter.charAt(0))) {
                isTokenOrWhitespace = true;
            }
        }
        return delimiter;
    }

    private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
        // render each token as "token<delimiter><payload>", matching the encoding under test
        String resultString = "";
        ObjectIntOpenHashMap<String> payloadCounter = new ObjectIntOpenHashMap<String>();
        for (String token : tokens) {
            if (!payloadCounter.containsKey(token)) {
                payloadCounter.putIfAbsent(token, 0);
            } else {
                payloadCounter.put(token, payloadCounter.get(token) + 1);
            }
            resultString = resultString + token;
            BytesRef payload = payloads.get(token).get(payloadCounter.get(token));
            if (payload.length > 0) {
                resultString = resultString + delimiter;
                switch (encoding) {
                case 0: {
                    resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset));
                    break;
                }
                case 1: {
                    resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset));
                    break;
                }
                case 2: {
                    resultString = resultString + payload.utf8ToString();
                    break;
                }
                default: {
                    throw new ElasticSearchException("unsupported encoding type");
                }
                }
            }
            resultString = resultString + " ";
        }
        return resultString;
    }

    private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) {
        Map<String, List<BytesRef>> payloads = new HashMap<String, List<BytesRef>>();
        for (String token : tokens) {
            if (payloads.get(token) == null) {
                payloads.put(token, new ArrayList<BytesRef>());
            }
            boolean createPayload = randomBoolean();
            if (createPayload) {
                switch (encoding) {
                case 0: {
                    float theFloat = randomFloat();
                    payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat)));
                    break;
                }
                case 1: {
                    payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt())));
                    break;
                }
                case 2: {
                    // identity encoding stores raw characters; replace whitespace so the
                    // whitespace tokenizer does not split the payload off the token
                    String payload = randomUnicodeOfLengthBetween(50, 100);
                    for (int c = 0; c < payload.length(); c++) {
                        if (Character.isWhitespace(payload.charAt(c))) {
                            payload = payload.replace(payload.charAt(c), 'w');
                        }
                    }
                    payloads.get(token).add(new BytesRef(payload));
                    break;
                }
                default: {
                    throw new ElasticSearchException("unsupported encoding type");
                }
                }
            } else {
                // no payload for this occurrence; an empty BytesRef marks "expect none"
                payloads.get(token).add(new BytesRef());
            }
        }
        return payloads;
    }

    private String[] createRandomTokens() {
        String[] tokens = { "the", "quick", "brown", "fox" };
        int numTokensWithDuplicates = randomIntBetween(3, 15);
        String[] finalTokens = new String[numTokensWithDuplicates];
        for (int i = 0; i < numTokensWithDuplicates; i++) {
            finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)];
        }
        return finalTokens;
    }
}
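The expected payloads above are built with Lucene's PayloadHelper; a tiny standalone sketch (not from this commit) of the encode/decode round trip the test depends on:

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.util.BytesRef;

public class PayloadRoundTrip {
    public static void main(String[] args) {
        // encodeFloat produces the 4-byte representation stored in the index;
        // decodeFloat recovers the original value from those bytes
        BytesRef payload = new BytesRef(PayloadHelper.encodeFloat(0.5f));
        float decoded = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
        System.out.println(decoded); // 0.5
    }
}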