Lazily fill CharTermAttribute if needed in CompletionTokenStream
This adds a delegate to CharTermAttributeImpl so that CompletionTokenStream stays compatible with the percolator, which needs a CharTermAttribute. In contrast to CharTermAttributeImpl, we fill the BytesRef directly with the UTF-8 bytes we already have, and convert to UTF-16 only if a consumer actually asks for it. Closes #4028
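For intuition, a minimal sketch of the pattern the diff below implements (the class name LazyTerm and its method names are illustrative stand-ins, not code from this commit): the term is kept as the UTF-8 bytes we already have, and the UTF-16 view is derived only when somebody actually requests it.

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;
    import org.apache.lucene.util.UnicodeUtil;

    // Illustrative stand-in for ByteTermAttributeImpl: keep the term as UTF-8
    // bytes and derive the UTF-16 view lazily, reusing a scratch CharsRef.
    final class LazyTerm {
        private final BytesRef bytes = new BytesRef(); // always filled (UTF-8)
        private CharsRef charsRef;                     // allocated only if UTF-16 is requested

        BytesRef bytesRef() {
            return bytes; // UTF-8 consumers (the suggester) never pay for a conversion
        }

        CharSequence toUTF16() {
            if (charsRef == null) {
                charsRef = new CharsRef(); // lazy allocation on first request
            }
            UnicodeUtil.UTF8toUTF16(bytes, charsRef); // convert only on demand
            return charsRef;
        }
    }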
commit 77239a76f8
parent f7d5d1e5c9
CompletionTokenStream.java

@@ -19,12 +19,8 @@
 package org.elasticsearch.search.suggest.completion;
 
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.util.*;
 import org.apache.lucene.util.fst.Util;
 
 import java.io.IOException;

@@ -36,9 +32,10 @@ import java.util.Set;
  */
 public final class CompletionTokenStream extends TokenStream {
 
-    private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);;
+    private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
     private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
-    private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
+    private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);;
+
 
     private final TokenStream input;
     private BytesRef payload;

@@ -46,9 +43,11 @@ public final class CompletionTokenStream extends TokenStream {
     private ToFiniteStrings toFiniteStrings;
     private int posInc = -1;
    private static final int MAX_PATHS = 256;
-    private final BytesRef scratch = new BytesRef();
+    private CharTermAttribute charTermAttribute;
 
     public CompletionTokenStream(TokenStream input, BytesRef payload, ToFiniteStrings toFiniteStrings) throws IOException {
+        // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
+        // the input stream entirely in toFiniteStrings(input)
         this.input = input;
         this.payload = payload;
         this.toFiniteStrings = toFiniteStrings;

@@ -74,8 +73,11 @@ public final class CompletionTokenStream extends TokenStream {
              * produced. Multi Fields have the same surface form and therefore sum up
              */
            posInc = 0;
-            Util.toBytesRef(finiteStrings.next(), scratch); // now we have UTF-8
-            bytesAtt.setBytesRef(scratch);
+            Util.toBytesRef(finiteStrings.next(), bytesAtt.getBytesRef()); // now we have UTF-8
+            if (charTermAttribute != null) {
+                charTermAttribute.setLength(0);
+                charTermAttribute.append(bytesAtt.toUTF16());
+            }
             if (payload != null) {
                 payloadAttr.setPayload(this.payload);
             }

@@ -107,16 +109,23 @@ public final class CompletionTokenStream extends TokenStream {
     @Override
     public void reset() throws IOException {
         super.reset();
+        if (hasAttribute(CharTermAttribute.class)) {
+            // we only create this if we really need it to save the UTF-8 to UTF-16 conversion
+            charTermAttribute = getAttribute(CharTermAttribute.class);
+        }
         finiteStrings = null;
         posInc = -1;
     }
 
     public interface ByteTermAttribute extends TermToBytesRefAttribute {
-        public void setBytesRef(BytesRef bytes);
+        // marker interface
+
+        public CharSequence toUTF16();
     }
 
     public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
-        private BytesRef bytes;
+        private final BytesRef bytes = new BytesRef();
+        private CharsRef charsRef;
 
         @Override
         public int fillBytesRef() {

@@ -128,19 +137,24 @@ public final class CompletionTokenStream extends TokenStream {
             return bytes;
         }
 
-        @Override
-        public void setBytesRef(BytesRef bytes) {
-            this.bytes = bytes;
-        }
-
         @Override
         public void clear() {
             bytes.length = 0;
         }
 
         @Override
         public void copyTo(AttributeImpl target) {
             ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
-            other.bytes = bytes;
+            other.bytes.copyBytes(bytes);
         }
+
+        @Override
+        public CharSequence toUTF16() {
+            if (charsRef == null) {
+                charsRef = new CharsRef();
+            }
+            UnicodeUtil.UTF8toUTF16(bytes, charsRef);
+            return charsRef;
+        }
     }
 }
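The lazy path hinges on the hasAttribute check in reset() above: a consumer must add its CharTermAttribute before calling reset(). A minimal consumer-side sketch under that assumption (`stream` stands for a CompletionTokenStream built elsewhere; the wrapper class and method names are illustrative, not from this commit):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
    import org.apache.lucene.util.BytesRef;

    final class LazyFillConsumerSketch {
        static void consume(TokenStream stream) throws IOException {
            // Register the CharTermAttribute BEFORE reset(); this is what arms the
            // lazy UTF-16 fill. Without it, only the UTF-8 bytes are ever touched.
            CharTermAttribute charAtt = stream.addAttribute(CharTermAttribute.class);
            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
            BytesRef ref = termAtt.getBytesRef();

            stream.reset(); // reset() now sees hasAttribute(CharTermAttribute.class) == true
            while (stream.incrementToken()) {
                termAtt.fillBytesRef();            // UTF-8 bytes, always available in ref
                String utf16 = charAtt.toString(); // UTF-16, filled per token for us
            }
            stream.end();
            stream.close();
        }
    }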
CompletionSuggestSearchTests.java

@@ -28,14 +28,17 @@ import org.elasticsearch.action.admin.indices.segments.IndexShardSegments;
 import org.elasticsearch.action.admin.indices.segments.ShardSegments;
 import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
 import org.elasticsearch.action.index.IndexRequestBuilder;
+import org.elasticsearch.action.percolate.PercolateResponse;
 import org.elasticsearch.action.search.SearchPhaseExecutionException;
 import org.elasticsearch.action.suggest.SuggestResponse;
+import org.elasticsearch.client.Requests;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.index.mapper.MapperException;
 import org.elasticsearch.index.mapper.MapperParsingException;
 import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
+import org.elasticsearch.percolator.PercolatorService;
 import org.elasticsearch.search.sort.FieldSortBuilder;
 import org.elasticsearch.search.suggest.completion.CompletionStats;
 import org.elasticsearch.search.suggest.completion.CompletionSuggestion;

@@ -56,6 +59,7 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF
 import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
 import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
 import static org.hamcrest.Matchers.*;
 

@@ -89,6 +93,36 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest {
         assertSuggestionsNotInOrder("t", "The Prodigy", "Turbonegro", "Turbonegro Get it on", "The Prodigy Firestarter");
     }
 
+    @Test
+    public void testSuggestFieldWithPercolateApi() throws Exception {
+        createIndexAndMapping();
+        String[][] input = {{"Foo Fighters"}, {"Foo Fighters"}, {"Foo Fighters"}, {"Foo Fighters"},
+                {"Generator", "Foo Fighters Generator"}, {"Learn to Fly", "Foo Fighters Learn to Fly"},
+                {"The Prodigy"}, {"The Prodigy"}, {"The Prodigy"}, {"Firestarter", "The Prodigy Firestarter"},
+                {"Turbonegro"}, {"Turbonegro"}, {"Get it on", "Turbonegro Get it on"}}; // work with frequencies
+        for (int i = 0; i < input.length; i++) {
+            client().prepareIndex(INDEX, TYPE, "" + i)
+                    .setSource(jsonBuilder()
+                            .startObject().startObject(FIELD)
+                            .startArray("input").value(input[i]).endArray()
+                            .endObject()
+                            .endObject()
+                    )
+                    .execute().actionGet();
+        }
+
+        client().prepareIndex(INDEX, PercolatorService.TYPE_NAME, "4")
+                .setSource(jsonBuilder().startObject().field("query", matchAllQuery()).endObject())
+                .execute().actionGet();
+
+        refresh();
+
+        PercolateResponse response = client().preparePercolate().setIndices(INDEX).setDocumentType(TYPE)
+                .setGetRequest(Requests.getRequest(INDEX).type(TYPE).id("1"))
+                .execute().actionGet();
+        assertThat(response.getCount(), equalTo(1l));
+    }
+
     @Test
     public void testBasicPrefixSuggestion() throws Exception {
         createIndexAndMapping();
CompletionTokenStreamTest.java

@@ -25,10 +25,7 @@ import org.apache.lucene.analysis.core.SimpleAnalyzer;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.synonym.SynonymMap.Builder;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;

@@ -42,6 +39,8 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.util.Set;
 
+import static org.hamcrest.Matchers.equalTo;
+
 public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase {
 
     final XAnalyzingSuggester suggester = new XAnalyzingSuggester(new SimpleAnalyzer(TEST_VERSION_CURRENT));

@@ -143,12 +142,36 @@ public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase
         suggestTokenStream.close();
 
     }
 
+    @Test
+    public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
+        TokenStream tokenStream = new MockTokenizer(new StringReader("mykeyword"), MockTokenizer.WHITESPACE, true);
+        BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
+        TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(new CompletionTokenStream(tokenStream, payload, new CompletionTokenStream.ToFiniteStrings() {
+            @Override
+            public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
+                return suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
+            }
+        }));
+        TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
+        BytesRef ref = termAtt.getBytesRef();
+        assertNotNull(ref);
+        suggestTokenStream.reset();
+
+        while (suggestTokenStream.incrementToken()) {
+            termAtt.fillBytesRef();
+            assertThat(ref.utf8ToString(), equalTo("mykeyword"));
+        }
+        suggestTokenStream.end();
+        suggestTokenStream.close();
+    }
+
 
     public final static class ByteTermAttrToCharTermAttrFilter extends TokenFilter {
-        private CharTermAttribute attr = addAttribute(CharTermAttribute.class);
         private ByteTermAttribute byteAttr = addAttribute(ByteTermAttribute.class);
         private PayloadAttribute payload = addAttribute(PayloadAttribute.class);
         private TypeAttribute type = addAttribute(TypeAttribute.class);
+        private CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
         protected ByteTermAttrToCharTermAttrFilter(TokenStream input) {
             super(input);

@@ -157,13 +180,12 @@ public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase
         public boolean incrementToken() throws IOException {
             if (input.incrementToken()) {
-                BytesRef bytesRef = byteAttr.getBytesRef();
-                attr.append(bytesRef.utf8ToString());
-                type.setType(payload.getPayload().utf8ToString());
+                // we move them over so we can assert them more easily in the tests
+                type.setType(payload.getPayload().utf8ToString());
                 return true;
             }
             return false;
         }
 
 
     }
 }