Highlighting: Add boundary_chars and boundary_max_scan to control text boundaries with the fast vector highlighter (term vector), closes #1614.
parent 9bdef666cb
commit 21405f5aa4
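The new parameters can be set globally on the highlight element or per field, in snake_case or camelCase (see HighlighterParseElement below). A hedged sketch of a request source using them, built with XContentBuilder; this example is not part of the commit, and the "title" field and the values are illustrative:

    import java.io.IOException;

    import org.elasticsearch.common.xcontent.XContentBuilder;

    import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

    public class HighlightBoundarySketch {

        // builds: {"highlight": {"boundary_max_scan": 30, "boundary_chars": ".,!? \t\n",
        //                        "fields": {"title": {"boundary_max_scan": 10}}}}
        public static XContentBuilder source() throws IOException {
            return jsonBuilder()
                    .startObject()
                        .startObject("highlight")
                            .field("boundary_max_scan", 30)         // global default for all highlighted fields
                            .field("boundary_chars", ".,!? \t\n")   // parsed with parser.text().toCharArray()
                            .startObject("fields")
                                .startObject("title")
                                    .field("boundary_max_scan", 10) // per-field value overrides the global one
                                .endObject()
                            .endObject()
                        .endObject()
                    .endObject();
        }
    }

When unset, boundary_max_scan defaults to 20 (DEFAULT_MAX_SCAN) and boundary_chars to '.', ',', '!', '?', space, tab and newline (DEFAULT_BOUNDARY_CHARS), both defined in the new class below.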
org/elasticsearch/common/lucene/search/vectorhighlight/SimpleBoundaryScanner2.java (new file)
@@ -0,0 +1,62 @@
package org.elasticsearch.common.lucene.search.vectorhighlight;

import gnu.trove.set.hash.TCharHashSet;

import org.apache.lucene.search.vectorhighlight.BoundaryScanner;

/**
 * A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner}.
 * <p/>
 * Uses a specialized char set to look up boundaries, and fixes a problem with the start offset
 * at the beginning of the text: https://issues.apache.org/jira/browse/LUCENE-3697 (which causes
 * problems when highlighting multiple empty fields...).
 */
public class SimpleBoundaryScanner2 implements BoundaryScanner {

    public static final int DEFAULT_MAX_SCAN = 20;
    public static final char[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'};

    public static final SimpleBoundaryScanner2 DEFAULT = new SimpleBoundaryScanner2();

    public int maxScan;
    public TCharHashSet boundaryChars;

    public SimpleBoundaryScanner2() {
        this(DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS);
    }

    public SimpleBoundaryScanner2(int maxScan, char[] boundaryChars) {
        this.maxScan = maxScan;
        this.boundaryChars = new TCharHashSet(boundaryChars);
    }

    public int findStartOffset(StringBuilder buffer, int start) {
        // avoid illegal start offset
        if (start > buffer.length() || start < 1) return start;
        int offset, count = maxScan;
        for (offset = start; offset > 0 && count > 0; count--) {
            // found?
            if (boundaryChars.contains(buffer.charAt(offset - 1))) return offset;
            offset--;
        }
        // LUCENE-3697: the scan reached the beginning of the text, which is a valid boundary
        if (offset == 0) {
            return 0;
        }
        // not found
        return start;
    }

    public int findEndOffset(StringBuilder buffer, int start) {
        // avoid illegal start offset
        if (start > buffer.length() || start < 0) return start;
        int offset, count = maxScan;
        //for( offset = start; offset <= buffer.length() && count > 0; count-- ){
        for (offset = start; offset < buffer.length() && count > 0; count--) {
            // found?
            if (boundaryChars.contains(buffer.charAt(offset))) return offset;
            offset++;
        }
        // not found
        return start;
    }
}
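A minimal sketch of the scanner's behavior (the sample text and offsets are made up, not from the commit): findStartOffset walks backwards from a proposed fragment start for at most maxScan characters looking for a boundary character, and findEndOffset walks forward the same way.

    import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;

    public class BoundaryScannerDemo {

        public static void main(String[] args) {
            StringBuilder text = new StringBuilder("the quick brown fox jumps over the lazy dog");
            // default scanner: boundary chars . , ! ? space tab newline, max scan of 20
            SimpleBoundaryScanner2 scanner = new SimpleBoundaryScanner2();

            // a fragment naively starting inside "brown" (offset 12) is snapped back
            // to offset 10, just after the space that precedes the word
            int start = scanner.findStartOffset(text, 12);

            // a fragment naively ending inside "jumps" (offset 22) is extended forward
            // to offset 25, the next space
            int end = scanner.findEndOffset(text, 22);

            System.out.println(text.substring(start, end)); // prints "brown fox jumps"
        }
    }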
org/elasticsearch/search/highlight/HighlightPhase.java
@@ -36,6 +36,7 @@ import org.elasticsearch.common.io.FastStringReader;
 import org.elasticsearch.common.lucene.document.SingleFieldSelector;
 import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
+import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.MapperService;
@@ -237,13 +238,19 @@ public class HighlightPhase implements FetchSubPhase {
         if (entry == null) {
             FragListBuilder fragListBuilder;
             FragmentsBuilder fragmentsBuilder;
+
+            BoundaryScanner boundaryScanner = SimpleBoundaryScanner2.DEFAULT;
+            if (field.boundaryMaxScan() != SimpleBoundaryScanner2.DEFAULT_MAX_SCAN || field.boundaryChars() != SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS) {
+                boundaryScanner = new SimpleBoundaryScanner2(field.boundaryMaxScan(), field.boundaryChars());
+            }
+
             if (field.numberOfFragments() == 0) {
                 fragListBuilder = new SingleFragListBuilder();

                 if (mapper.stored()) {
-                    fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags());
+                    fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
                 } else {
-                    fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+                    fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
                 }
             } else {
                 if (field.fragmentOffset() == -1)
@@ -253,15 +260,15 @@ public class HighlightPhase implements FetchSubPhase {

                 if (field.scoreOrdered()) {
                     if (mapper.stored()) {
-                        fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags());
+                        fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
                     } else {
-                        fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+                        fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
                     }
                 } else {
                     if (mapper.stored()) {
-                        fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags());
+                        fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
                     } else {
-                        fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+                        fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
                    }
                }
            }
        }
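Note that `field.boundaryChars() != SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS` compares array references, not contents. That is enough here because HighlighterParseElement (below) falls back to that very same DEFAULT_BOUNDARY_CHARS array when the request sets nothing, so a different reference implies user-supplied characters. A small sketch of the distinction (class name hypothetical):

    import java.util.Arrays;

    public class ReferenceCheckDemo {

        static final char[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'};

        public static void main(String[] args) {
            char[] unset = DEFAULT_BOUNDARY_CHARS;      // parse element reuses the shared default array
            char[] custom = ".,!? \t\n".toCharArray();  // user input: same contents, different array

            System.out.println(unset != DEFAULT_BOUNDARY_CHARS);               // false -> reuse the shared DEFAULT scanner
            System.out.println(custom != DEFAULT_BOUNDARY_CHARS);              // true  -> build a dedicated scanner
            System.out.println(Arrays.equals(custom, DEFAULT_BOUNDARY_CHARS)); // true: contents are equal anyway
        }
    }

The upshot is that a request spelling out exactly the default characters still gets a fresh scanner instance, which is functionally identical to the shared DEFAULT one.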
org/elasticsearch/search/highlight/HighlighterParseElement.java
@@ -20,6 +20,7 @@
 package org.elasticsearch.search.highlight;

 import com.google.common.collect.Lists;
+import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.search.SearchParseElement;
 import org.elasticsearch.search.SearchParseException;
@@ -73,6 +74,8 @@ public class HighlighterParseElement implements SearchParseElement {
         int globalFragmentSize = 100;
         int globalNumOfFragments = 5;
         String globalEncoder = "default";
+        int globalBoundaryMaxScan = SimpleBoundaryScanner2.DEFAULT_MAX_SCAN;
+        char[] globalBoundaryChars = SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS;

         while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
             if (token == XContentParser.Token.FIELD_NAME) {
@@ -110,6 +113,10 @@ public class HighlighterParseElement implements SearchParseElement {
                     globalEncoder = parser.text();
                 } else if ("require_field_match".equals(topLevelFieldName) || "requireFieldMatch".equals(topLevelFieldName)) {
                     globalRequireFieldMatch = parser.booleanValue();
+                } else if ("boundary_max_scan".equals(topLevelFieldName) || "boundaryMaxScan".equals(topLevelFieldName)) {
+                    globalBoundaryMaxScan = parser.intValue();
+                } else if ("boundary_chars".equals(topLevelFieldName) || "boundaryChars".equals(topLevelFieldName)) {
+                    globalBoundaryChars = parser.text().toCharArray();
                 }
             } else if (token == XContentParser.Token.START_OBJECT) {
                 if ("fields".equals(topLevelFieldName)) {
@@ -150,6 +157,10 @@ public class HighlighterParseElement implements SearchParseElement {
                                 field.scoreOrdered("score".equals(parser.text()));
                             } else if ("require_field_match".equals(fieldName) || "requireFieldMatch".equals(fieldName)) {
                                 field.requireFieldMatch(parser.booleanValue());
+                            } else if ("boundary_max_scan".equals(fieldName) || "boundaryMaxScan".equals(fieldName)) {
+                                field.boundaryMaxScan(parser.intValue());
+                            } else if ("boundary_chars".equals(fieldName) || "boundaryChars".equals(fieldName)) {
+                                field.boundaryChars(parser.text().toCharArray());
                             }
                         }
                     }
                 }
@@ -189,6 +200,12 @@ public class HighlighterParseElement implements SearchParseElement {
             if (field.requireFieldMatch() == null) {
                 field.requireFieldMatch(globalRequireFieldMatch);
             }
+            if (field.boundaryMaxScan() == -1) {
+                field.boundaryMaxScan(globalBoundaryMaxScan);
+            }
+            if (field.boundaryChars() == null) {
+                field.boundaryChars(globalBoundaryChars);
+            }
         }

         context.highlight(new SearchContextHighlight(fields));
org/elasticsearch/search/highlight/SearchContextHighlight.java
@@ -58,6 +58,9 @@ public class SearchContextHighlight {

         private Boolean requireFieldMatch;

+        private int boundaryMaxScan = -1;
+        private char[] boundaryChars = null;
+
         public Field(String field) {
             this.field = field;
         }
@@ -137,5 +140,21 @@ public class SearchContextHighlight {
         public void requireFieldMatch(boolean requireFieldMatch) {
             this.requireFieldMatch = requireFieldMatch;
         }
+
+        public int boundaryMaxScan() {
+            return boundaryMaxScan;
+        }
+
+        public void boundaryMaxScan(int boundaryMaxScan) {
+            this.boundaryMaxScan = boundaryMaxScan;
+        }
+
+        public char[] boundaryChars() {
+            return boundaryChars;
+        }
+
+        public void boundaryChars(char[] boundaryChars) {
+            this.boundaryChars = boundaryChars;
+        }
     }
 }
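The -1 and null initializers act as "unset" sentinels: after parsing, HighlighterParseElement (above) overwrites them with the global values, so HighlightPhase can rely on both always being populated. A condensed sketch of the pattern, with a hypothetical class name, not the actual code:

    // hedged sketch of the sentinel-merge pattern used above
    class FieldHighlightSettings {

        int boundaryMaxScan = -1;    // -1 means "not set on this field"
        char[] boundaryChars = null; // null means "not set on this field"

        void applyGlobalDefaults(int globalBoundaryMaxScan, char[] globalBoundaryChars) {
            if (boundaryMaxScan == -1) {
                boundaryMaxScan = globalBoundaryMaxScan;
            }
            if (boundaryChars == null) {
                boundaryChars = globalBoundaryChars;
            }
        }
    }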
org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java
@@ -21,6 +21,7 @@ package org.elasticsearch.search.highlight.vectorhighlight;

 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
 import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.internal.SearchContext;
@@ -39,8 +40,8 @@ public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder {
     private final SearchContext searchContext;

     public SourceScoreOrderFragmentsBuilder(FieldMapper mapper, SearchContext searchContext,
-                                            String[] preTags, String[] postTags) {
-        super(preTags, postTags);
+                                            String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
+        super(preTags, postTags, boundaryScanner);
         this.mapper = mapper;
         this.searchContext = searchContext;
     }
org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java
@@ -21,6 +21,7 @@ package org.elasticsearch.search.highlight.vectorhighlight;

 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
 import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.internal.SearchContext;
@@ -39,8 +40,8 @@ public class SourceSimpleFragmentsBuilder extends SimpleFragmentsBuilder {
     private final SearchContext searchContext;

     public SourceSimpleFragmentsBuilder(FieldMapper mapper, SearchContext searchContext,
-                                        String[] preTags, String[] postTags) {
-        super(preTags, postTags);
+                                        String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
+        super(preTags, postTags, boundaryScanner);
         this.mapper = mapper;
         this.searchContext = searchContext;
     }
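Both Source* builders simply forward the scanner to their Lucene base classes, which consult it when trimming fragment boundaries. For context, a hedged sketch of wiring the scanner straight into Lucene's FastVectorHighlighter outside Elasticsearch; the constructor signatures are assumed from Lucene's vectorhighlight package of this era, and the tag values are illustrative:

    import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
    import org.apache.lucene.search.vectorhighlight.FragListBuilder;
    import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
    import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder;
    import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder;
    import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;

    public class FvhWiringSketch {

        public static FastVectorHighlighter build() {
            // scan at most 10 chars for a '.' or newline boundary
            SimpleBoundaryScanner2 scanner = new SimpleBoundaryScanner2(10, new char[]{'.', '\n'});
            FragListBuilder fragListBuilder = new SimpleFragListBuilder();
            FragmentsBuilder fragmentsBuilder = new SimpleFragmentsBuilder(
                    new String[]{"<em>"}, new String[]{"</em>"}, scanner);
            // phraseHighlight = true, fieldMatch = true
            return new FastVectorHighlighter(true, true, fragListBuilder, fragmentsBuilder);
        }
    }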
HighlighterSearchTests.java
@@ -555,7 +555,7 @@ public class HighlighterSearchTests extends AbstractNodesTests {
         assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));

         SearchHit hit = search.hits().getAt(0);
-        assertThat(hit.highlightFields().get("title").fragments()[0], equalTo(" is a <em>test</em> "));
+        assertThat(hit.highlightFields().get("title").fragments()[0], equalTo("this is a <em>test</em> "));

         // search on title.key and highlight on title
         search = client.prepareSearch()
@@ -596,7 +596,7 @@ public class HighlighterSearchTests extends AbstractNodesTests {
         assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));

         SearchHit hit = search.hits().getAt(0);
-        assertThat(hit.highlightFields().get("title").fragments()[0], equalTo(" is a <em>test</em> "));
+        assertThat(hit.highlightFields().get("title").fragments()[0], equalTo("this is a <em>test</em> "));

         // search on title.key and highlight on title.key
         search = client.prepareSearch()
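The expected fragments change because the boundary scanner is now always in play: per the LUCENE-3697 fix, findStartOffset returns 0 when it scans all the way back to the beginning of the text, so the fragment keeps the leading "this" instead of starting mid-sentence at the stale offset. A sketch of that edge case (sample text made up):

    import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;

    public class StartOffsetDemo {

        public static void main(String[] args) {
            StringBuilder text = new StringBuilder("this is a test");
            SimpleBoundaryScanner2 scanner = new SimpleBoundaryScanner2();

            // no boundary char between offset 4 and the start of the text: the scan
            // reaches offset 0 and returns it, so the fragment starts at "this"
            System.out.println(scanner.findStartOffset(text, 4)); // prints 0
        }
    }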
VectorHighlighterTests.java
@@ -60,7 +60,7 @@ public class VectorHighlighterTests {
         String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))),
                 reader, topDocs.scoreDocs[0].doc, "content", 30);
         assertThat(fragment, notNullValue());
-        System.out.println(fragment);
+        assertThat(fragment, equalTo("e big <b>bad</b> dog "));
     }

     @Test