Analyze API Position Length Support (#22574)

Expose the position length in the analyze API response when a token's position length is greater than the default of 1.
This commit is contained in:
Matt Weber 2017-01-13 06:12:49 -08:00 committed by Michael McCandless
parent fef1407fd2
commit beceb4bf8a
4 changed files with 105 additions and 9 deletions

View File

@ -113,7 +113,7 @@ public class AnalyzeRequestBuilder extends SingleShardOperationRequestBuilder<An
/**
* Sets the attributes that will be included in the results.
*
* @param attributes the attributes to set; forwarded unchanged to the underlying request
* @return this builder, so that calls can be chained
*/
public AnalyzeRequestBuilder setAttributes(String attributes){
public AnalyzeRequestBuilder setAttributes(String... attributes){
request.attributes(attributes);
return this;
}

View File

@ -39,18 +39,20 @@ public class AnalyzeResponse extends ActionResponse implements Iterable<AnalyzeR
private int startOffset;
private int endOffset;
private int position;
private int positionLength = 1;
private Map<String, Object> attributes;
private String type;
/**
* Creates an empty token. The position length starts at its field default of 1;
* no other field is assigned here.
* NOTE(review): presumably used so the fields can be filled in from a stream afterwards - confirm.
*/
AnalyzeToken() {
}
/**
* Creates a token with every field supplied by the caller. Each argument is stored as-is;
* no validation is performed.
*
* @param term the token text
* @param position the position assigned to the token
* @param startOffset the start offset assigned to the token
* @param endOffset the end offset assigned to the token
* @param positionLength the position length assigned to the token
* @param type the token type
* @param attributes additional attributes of the token; may be null
*/
public AnalyzeToken(String term, int position, int startOffset, int endOffset, String type,
Map<String, Object> attributes) {
public AnalyzeToken(String term, int position, int startOffset, int endOffset, int positionLength,
String type, Map<String, Object> attributes) {
this.term = term;
this.position = position;
this.startOffset = startOffset;
this.endOffset = endOffset;
this.positionLength = positionLength;
this.type = type;
this.attributes = attributes;
}
@ -71,6 +73,10 @@ public class AnalyzeResponse extends ActionResponse implements Iterable<AnalyzeR
return this.position;
}
/**
* Returns the position length of this token. The value is 1 unless a different
* length was passed to the constructor or read from the stream.
*
* @return the position length of this token
*/
public int getPositionLength() {
return this.positionLength;
}
/**
* Returns the type of this token.
*
* @return the token type
*/
public String getType() {
return this.type;
}
@ -87,6 +93,9 @@ public class AnalyzeResponse extends ActionResponse implements Iterable<AnalyzeR
builder.field(Fields.END_OFFSET, endOffset);
builder.field(Fields.TYPE, type);
builder.field(Fields.POSITION, position);
if (positionLength > 1) {
builder.field(Fields.POSITION_LENGTH, positionLength);
}
if (attributes != null && !attributes.isEmpty()) {
for (Map.Entry<String, Object> entity : attributes.entrySet()) {
builder.field(entity.getKey(), entity.getValue());
@ -108,6 +117,14 @@ public class AnalyzeResponse extends ActionResponse implements Iterable<AnalyzeR
startOffset = in.readInt();
endOffset = in.readInt();
position = in.readVInt();
if (in.getVersion().onOrAfter(Version.V_5_2_0_UNRELEASED)) {
Integer len = in.readOptionalVInt();
if (len != null) {
positionLength = len;
} else {
positionLength = 1;
}
}
type = in.readOptionalString();
if (in.getVersion().onOrAfter(Version.V_2_2_0)) {
attributes = (Map<String, Object>) in.readGenericValue();
@ -120,6 +137,9 @@ public class AnalyzeResponse extends ActionResponse implements Iterable<AnalyzeR
out.writeInt(startOffset);
out.writeInt(endOffset);
out.writeVInt(position);
if (out.getVersion().onOrAfter(Version.V_5_2_0_UNRELEASED)) {
out.writeOptionalVInt(positionLength > 1 ? positionLength : null);
}
out.writeOptionalString(type);
if (out.getVersion().onOrAfter(Version.V_2_2_0)) {
out.writeGenericValue(attributes);
@ -208,6 +228,7 @@ public class AnalyzeResponse extends ActionResponse implements Iterable<AnalyzeR
static final String END_OFFSET = "end_offset";
static final String TYPE = "type";
static final String POSITION = "position";
static final String POSITION_LENGTH = "positionLength";
static final String DETAIL = "detail";
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@ -218,13 +219,15 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);
PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);
while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
lastPosition = lastPosition + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(), null));
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), null));
}
stream.end();
@ -381,6 +384,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);
PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);
while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
@ -388,7 +392,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
lastPosition = lastPosition + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes)));
lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), extractExtendedAttributes(stream, includeAttributes)));
}
stream.end();

View File

@ -50,21 +50,25 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(token.getStartOffset(), equalTo(0));
assertThat(token.getEndOffset(), equalTo(4));
assertThat(token.getPosition(), equalTo(0));
assertThat(token.getPositionLength(), equalTo(1));
token = analyzeResponse.getTokens().get(1);
assertThat(token.getTerm(), equalTo("is"));
assertThat(token.getStartOffset(), equalTo(5));
assertThat(token.getEndOffset(), equalTo(7));
assertThat(token.getPosition(), equalTo(1));
assertThat(token.getPositionLength(), equalTo(1));
token = analyzeResponse.getTokens().get(2);
assertThat(token.getTerm(), equalTo("a"));
assertThat(token.getStartOffset(), equalTo(8));
assertThat(token.getEndOffset(), equalTo(9));
assertThat(token.getPosition(), equalTo(2));
assertThat(token.getPositionLength(), equalTo(1));
token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("test"));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getPositionLength(), equalTo(1));
}
}
@ -104,7 +108,7 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.getTokens().get(0).getPosition(), equalTo(1));
assertThat(analyzeResponse.getTokens().get(0).getStartOffset(), equalTo(3));
assertThat(analyzeResponse.getTokens().get(0).getEndOffset(), equalTo(9));
assertThat(analyzeResponse.getTokens().get(0).getPositionLength(), equalTo(1));
}
public void testAnalyzeWithCharFilters() throws Exception {
@ -137,6 +141,54 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(token.getTerm(), equalTo("fish"));
}
/**
 * Analyzes text through a custom analyzer with a multi-word synonym rule and checks that
 * the stacked synonym token reports a position length of 3 while every other token
 * reports a position length of 1.
 */
public void testAnalyzeWithNonDefaultPostionLength() throws Exception {
    assertAcked(
        prepareCreate("test")
            .addAlias(new Alias("alias"))
            .setSettings(
                Settings.builder()
                    .put(indexSettings())
                    .put("index.analysis.filter.syns.type", "synonym")
                    .putArray("index.analysis.filter.syns.synonyms", "wtf, what the fudge")
                    .put("index.analysis.analyzer.custom_syns.tokenizer", "standard")
                    .putArray("index.analysis.analyzer.custom_syns.filter", "lowercase", "syns")));
    ensureGreen();

    AnalyzeResponse response = client().admin().indices()
        .prepareAnalyze("say what the fudge")
        .setIndex("test")
        .setAnalyzer("custom_syns")
        .get();

    // Expected tokens in emission order; "wtf" is stacked on "what" and spans three positions.
    final String[] expectedTerms = {"say", "what", "wtf", "the", "fudge"};
    final int[] expectedPositions = {0, 1, 1, 2, 3};
    final int[] expectedStartOffsets = {0, 4, 4, 9, 13};
    final int[] expectedEndOffsets = {3, 8, 18, 12, 18};
    final int[] expectedPositionLengths = {1, 1, 3, 1, 1};

    assertThat(response.getTokens().size(), equalTo(5));
    for (int i = 0; i < expectedTerms.length; i++) {
        AnalyzeResponse.AnalyzeToken current = response.getTokens().get(i);
        assertThat(current.getTerm(), equalTo(expectedTerms[i]));
        assertThat(current.getPosition(), equalTo(expectedPositions[i]));
        assertThat(current.getStartOffset(), equalTo(expectedStartOffsets[i]));
        assertThat(current.getEndOffset(), equalTo(expectedEndOffsets[i]));
        assertThat(current.getPositionLength(), equalTo(expectedPositionLengths[i]));
    }
}
public void testAnalyzerWithFieldOrTypeTests() throws Exception {
assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
ensureGreen();
@ -154,6 +206,7 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(token.getTerm(), equalTo("test"));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));
assertThat(token.getPositionLength(), equalTo(1));
}
}
@ -200,13 +253,14 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));
assertThat(token.getPositionLength(), equalTo(1));
token = analyzeResponse.getTokens().get(5);
assertThat(token.getTerm(), equalTo("second"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(19));
assertThat(token.getEndOffset(), equalTo(25));
assertThat(token.getPositionLength(), equalTo(1));
}
public void testDetailAnalyze() throws Exception {
@ -350,12 +404,14 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));
assertThat(token.getPositionLength(), equalTo(1));
token = analyzeResponse.detail().analyzer().getTokens()[5];
assertThat(token.getTerm(), equalTo("second"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(19));
assertThat(token.getEndOffset(), equalTo(25));
assertThat(token.getPositionLength(), equalTo(1));
}
public void testDetailAnalyzeWithMultiValuesWithCustomAnalyzer() throws Exception {
@ -395,12 +451,14 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(15));
assertThat(token.getPositionLength(), equalTo(1));
token = analyzeResponse.detail().tokenizer().getTokens()[5];
assertThat(token.getTerm(), equalTo("troubled"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(20));
assertThat(token.getEndOffset(), equalTo(28));
assertThat(token.getPositionLength(), equalTo(1));
// tokenfilter(snowball)
assertThat(analyzeResponse.detail().tokenfilters().length, equalTo(2));
@ -412,12 +470,14 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(15));
assertThat(token.getPositionLength(), equalTo(1));
token = analyzeResponse.detail().tokenfilters()[0].getTokens()[5];
assertThat(token.getTerm(), equalTo("troubl"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(20));
assertThat(token.getEndOffset(), equalTo(28));
assertThat(token.getPositionLength(), equalTo(1));
// tokenfilter(lowercase)
assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("lowercase"));
@ -428,14 +488,14 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(15));
assertThat(token.getPositionLength(), equalTo(1));
token = analyzeResponse.detail().tokenfilters()[0].getTokens()[5];
assertThat(token.getTerm(), equalTo("troubl"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(20));
assertThat(token.getEndOffset(), equalTo(28));
assertThat(token.getPositionLength(), equalTo(1));
}
public void testNonExistTokenizer() {
@ -468,16 +528,19 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getEndOffset(), equalTo(3));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPosition(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPositionLength(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getTerm(), equalTo("buzz"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getStartOffset(), equalTo(4));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getEndOffset(), equalTo(8));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPosition(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPositionLength(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getTerm(), equalTo("test"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getStartOffset(), equalTo(9));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getEndOffset(), equalTo(13));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPosition(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPositionLength(), equalTo(1));
// tokenfilter(lowercase)
assertThat(analyzeResponse.detail().tokenfilters().length, equalTo(2));
@ -487,16 +550,19 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getStartOffset(), equalTo(0));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getEndOffset(), equalTo(3));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getPosition(), equalTo(0));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getPositionLength(), equalTo(1));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[1].getTerm(), equalTo("buzz"));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[1].getStartOffset(), equalTo(4));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[1].getEndOffset(), equalTo(8));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[1].getPosition(), equalTo(1));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[1].getPositionLength(), equalTo(1));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getTerm(), equalTo("test"));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getStartOffset(), equalTo(9));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getEndOffset(), equalTo(13));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getPosition(), equalTo(2));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getPositionLength(), equalTo(1));
// tokenfilter({"type": "stop", "stopwords": ["foo", "buzz"]})
assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter_[1]"));
@ -506,6 +572,7 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getStartOffset(), equalTo(9));
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getEndOffset(), equalTo(13));
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getPosition(), equalTo(2));
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getPositionLength(), equalTo(1));
}
@ -533,6 +600,7 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("jeff qit fish"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getEndOffset(), equalTo(15));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPositionLength(), equalTo(1));
}
@ -556,16 +624,19 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getEndOffset(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPosition(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPositionLength(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getTerm(), equalTo("oo"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getStartOffset(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getEndOffset(), equalTo(3));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPosition(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPositionLength(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getTerm(), equalTo("od"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getStartOffset(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getEndOffset(), equalTo(4));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPosition(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPositionLength(), equalTo(1));
}
}