Highlighting: Add boundary_chars and boundary_max_scan to control text boundaries with the fast vector highlighter (term vector), closes #1614.
parent 9bdef666cb
commit 21405f5aa4

SimpleBoundaryScanner2.java (new file)
@@ -0,0 +1,62 @@
+package org.elasticsearch.common.lucene.search.vectorhighlight;
+
+import gnu.trove.set.hash.TCharHashSet;
+import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
+
+/**
+ * A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner}.
+ * <p/>
+ * Uses a specialized char set for boundary lookup, and fixes a problem with the start offset at the
+ * beginning of the text: https://issues.apache.org/jira/browse/LUCENE-3697 (which mis-handles
+ * multiple empty fields to highlight...).
+ */
+public class SimpleBoundaryScanner2 implements BoundaryScanner {
+
+    public static final int DEFAULT_MAX_SCAN = 20;
+    public static final char[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'};
+
+    public static final SimpleBoundaryScanner2 DEFAULT = new SimpleBoundaryScanner2();
+
+    public int maxScan;
+    public TCharHashSet boundaryChars;
+
+    public SimpleBoundaryScanner2() {
+        this(DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS);
+    }
+
+    public SimpleBoundaryScanner2(int maxScan, char[] boundaryChars) {
+        this.maxScan = maxScan;
+        this.boundaryChars = new TCharHashSet(boundaryChars);
+    }
+
+    public int findStartOffset(StringBuilder buffer, int start) {
+        // avoid illegal start offset
+        if (start > buffer.length() || start < 1) return start;
+        int offset, count = maxScan;
+        for (offset = start; offset > 0 && count > 0; count--) {
+            // found?
+            if (boundaryChars.contains(buffer.charAt(offset - 1))) return offset;
+            offset--;
+        }
+        // LUCENE-3697
+        if (offset == 0) {
+            return 0;
+        }
+        // not found
+        return start;
+    }
+
+    public int findEndOffset(StringBuilder buffer, int start) {
+        // avoid illegal start offset
+        if (start > buffer.length() || start < 0) return start;
+        int offset, count = maxScan;
+        //for( offset = start; offset <= buffer.length() && count > 0; count-- ){
+        for (offset = start; offset < buffer.length() && count > 0; count--) {
+            // found?
+            if (boundaryChars.contains(buffer.charAt(offset))) return offset;
+            offset++;
+        }
+        // not found
+        return start;
+    }
+}
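A quick illustration of the scanner's behavior (not part of the commit; the demo class, sample text, and offsets below are made up for the example):

import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;

public class BoundaryScannerDemo {
    public static void main(String[] args) {
        // boundary chars limited to '.' and ' '; maxScan caps how far the scan may go
        SimpleBoundaryScanner2 scanner = new SimpleBoundaryScanner2(20, new char[]{'.', ' '});
        StringBuilder text = new StringBuilder("The quick fox. It jumps far.");

        System.out.println(scanner.findStartOffset(text, 20)); // 18: scans left, stops just after the space before "jumps"
        System.out.println(scanner.findEndOffset(text, 20));   // 23: scans right, stops at the space after "jumps"
        System.out.println(scanner.findStartOffset(text, 2));  // 0: the LUCENE-3697 fix snaps to the start of the text
    }
}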
HighlightPhase.java
@@ -36,6 +36,7 @@ import org.elasticsearch.common.io.FastStringReader;
 import org.elasticsearch.common.lucene.document.SingleFieldSelector;
 import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
+import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.MapperService;
@@ -237,13 +238,19 @@ public class HighlightPhase implements FetchSubPhase {
         if (entry == null) {
             FragListBuilder fragListBuilder;
             FragmentsBuilder fragmentsBuilder;
+
+            BoundaryScanner boundaryScanner = SimpleBoundaryScanner2.DEFAULT;
+            if (field.boundaryMaxScan() != SimpleBoundaryScanner2.DEFAULT_MAX_SCAN || field.boundaryChars() != SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS) {
+                boundaryScanner = new SimpleBoundaryScanner2(field.boundaryMaxScan(), field.boundaryChars());
+            }
+
             if (field.numberOfFragments() == 0) {
                 fragListBuilder = new SingleFragListBuilder();

                 if (mapper.stored()) {
-                    fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags());
+                    fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
                 } else {
-                    fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+                    fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
                 }
             } else {
                 if (field.fragmentOffset() == -1)
@@ -253,15 +260,15 @@ public class HighlightPhase implements FetchSubPhase {

                 if (field.scoreOrdered()) {
                     if (mapper.stored()) {
-                        fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags());
+                        fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
                     } else {
-                        fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+                        fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
                     }
                 } else {
                     if (mapper.stored()) {
-                        fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags());
+                        fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
                     } else {
-                        fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+                        fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
                     }
                 }
             }
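Net effect of the two hunks above: a non-default boundary_max_scan or boundary_chars yields a dedicated scanner, which is handed to whichever fragments builder gets picked; otherwise the shared DEFAULT instance is reused. A minimal sketch of that wiring (the tags and values are illustrative, not from the commit):

import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;

public class BoundaryWiringSketch {
    static FragmentsBuilder newBuilder() {
        // custom settings -> a fresh scanner instead of SimpleBoundaryScanner2.DEFAULT
        BoundaryScanner scanner = new SimpleBoundaryScanner2(10, ".!?".toCharArray());
        return new ScoreOrderFragmentsBuilder(new String[]{"<em>"}, new String[]{"</em>"}, scanner);
    }
}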
HighlighterParseElement.java
@@ -20,6 +20,7 @@
 package org.elasticsearch.search.highlight;

 import com.google.common.collect.Lists;
+import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.search.SearchParseElement;
 import org.elasticsearch.search.SearchParseException;
@@ -73,6 +74,8 @@ public class HighlighterParseElement implements SearchParseElement {
         int globalFragmentSize = 100;
         int globalNumOfFragments = 5;
         String globalEncoder = "default";
+        int globalBoundaryMaxScan = SimpleBoundaryScanner2.DEFAULT_MAX_SCAN;
+        char[] globalBoundaryChars = SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS;

         while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
             if (token == XContentParser.Token.FIELD_NAME) {
@@ -110,6 +113,10 @@ public class HighlighterParseElement implements SearchParseElement {
                     globalEncoder = parser.text();
                 } else if ("require_field_match".equals(topLevelFieldName) || "requireFieldMatch".equals(topLevelFieldName)) {
                     globalRequireFieldMatch = parser.booleanValue();
+                } else if ("boundary_max_scan".equals(topLevelFieldName) || "boundaryMaxScan".equals(topLevelFieldName)) {
+                    globalBoundaryMaxScan = parser.intValue();
+                } else if ("boundary_chars".equals(topLevelFieldName) || "boundaryChars".equals(topLevelFieldName)) {
+                    globalBoundaryChars = parser.text().toCharArray();
                 }
             } else if (token == XContentParser.Token.START_OBJECT) {
                 if ("fields".equals(topLevelFieldName)) {
@@ -150,6 +157,10 @@ public class HighlighterParseElement implements SearchParseElement {
                             field.scoreOrdered("score".equals(parser.text()));
                         } else if ("require_field_match".equals(fieldName) || "requireFieldMatch".equals(fieldName)) {
                             field.requireFieldMatch(parser.booleanValue());
+                        } else if ("boundary_max_scan".equals(fieldName) || "boundaryMaxScan".equals(fieldName)) {
+                            field.boundaryMaxScan(parser.intValue());
+                        } else if ("boundary_chars".equals(fieldName) || "boundaryChars".equals(fieldName)) {
+                            field.boundaryChars(parser.text().toCharArray());
                         }
                     }
                 }
@@ -189,6 +200,12 @@ public class HighlighterParseElement implements SearchParseElement {
             if (field.requireFieldMatch() == null) {
                 field.requireFieldMatch(globalRequireFieldMatch);
             }
+            if (field.boundaryMaxScan() == -1) {
+                field.boundaryMaxScan(globalBoundaryMaxScan);
+            }
+            if (field.boundaryChars() == null) {
+                field.boundaryChars(globalBoundaryChars);
+            }
         }

         context.highlight(new SearchContextHighlight(fields));
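With the parse element changes above, both settings are accepted at the top level of highlight (as globals) and per field, with per-field values winning. A hypothetical request body exercising them (field names and values are illustrative):

{
  "query" : { "term" : { "content" : "test" } },
  "highlight" : {
    "boundary_max_scan" : 30,
    "boundary_chars" : ".,!? \t\n",
    "fields" : {
      "content" : {},
      "title" : { "boundary_chars" : ".!?" }
    }
  }
}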
SearchContextHighlight.java
@@ -58,6 +58,9 @@ public class SearchContextHighlight {

         private Boolean requireFieldMatch;

+        private int boundaryMaxScan = -1;
+        private char[] boundaryChars = null;
+
         public Field(String field) {
             this.field = field;
         }
@@ -137,5 +140,21 @@ public class SearchContextHighlight {
         public void requireFieldMatch(boolean requireFieldMatch) {
             this.requireFieldMatch = requireFieldMatch;
         }
+
+        public int boundaryMaxScan() {
+            return boundaryMaxScan;
+        }
+
+        public void boundaryMaxScan(int boundaryMaxScan) {
+            this.boundaryMaxScan = boundaryMaxScan;
+        }
+
+        public char[] boundaryChars() {
+            return boundaryChars;
+        }
+
+        public void boundaryChars(char[] boundaryChars) {
+            this.boundaryChars = boundaryChars;
+        }
     }
 }
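The -1 and null sentinels are what let HighlighterParseElement tell "unset" apart from an explicit per-field value before falling back to the globals. A small sketch of that contract (hypothetical usage; assumes Field is a public static nested class, as the diff suggests):

import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;
import org.elasticsearch.search.highlight.SearchContextHighlight;

public class FieldDefaultsSketch {
    public static void main(String[] args) {
        SearchContextHighlight.Field field = new SearchContextHighlight.Field("title");
        field.boundaryMaxScan(25);               // explicit per-field value
        if (field.boundaryChars() == null) {     // sentinel still unset -> inherit the global default
            field.boundaryChars(SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS);
        }
    }
}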
SourceScoreOrderFragmentsBuilder.java
@@ -21,6 +21,7 @@ package org.elasticsearch.search.highlight.vectorhighlight;

 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
 import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.internal.SearchContext;

@@ -39,8 +40,8 @@ public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder
     private final SearchContext searchContext;

     public SourceScoreOrderFragmentsBuilder(FieldMapper mapper, SearchContext searchContext,
-                                            String[] preTags, String[] postTags) {
-        super(preTags, postTags);
+                                            String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
+        super(preTags, postTags, boundaryScanner);
         this.mapper = mapper;
         this.searchContext = searchContext;
     }
SourceSimpleFragmentsBuilder.java
@@ -21,6 +21,7 @@ package org.elasticsearch.search.highlight.vectorhighlight;

 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
 import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.internal.SearchContext;

@@ -39,8 +40,8 @@ public class SourceSimpleFragmentsBuilder extends SimpleFragmentsBuilder {
     private final SearchContext searchContext;

     public SourceSimpleFragmentsBuilder(FieldMapper mapper, SearchContext searchContext,
-                                        String[] preTags, String[] postTags) {
-        super(preTags, postTags);
+                                        String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
+        super(preTags, postTags, boundaryScanner);
         this.mapper = mapper;
         this.searchContext = searchContext;
     }
HighlighterSearchTests.java
@@ -555,7 +555,7 @@ public class HighlighterSearchTests extends AbstractNodesTests {
         assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));

         SearchHit hit = search.hits().getAt(0);
-        assertThat(hit.highlightFields().get("title").fragments()[0], equalTo(" is a <em>test</em> "));
+        assertThat(hit.highlightFields().get("title").fragments()[0], equalTo("this is a <em>test</em> "));

         // search on title.key and highlight on title
         search = client.prepareSearch()

@@ -596,7 +596,7 @@ public class HighlighterSearchTests extends AbstractNodesTests {
         assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));

         SearchHit hit = search.hits().getAt(0);
-        assertThat(hit.highlightFields().get("title").fragments()[0], equalTo(" is a <em>test</em> "));
+        assertThat(hit.highlightFields().get("title").fragments()[0], equalTo("this is a <em>test</em> "));

         // search on title.key and highlight on title.key
         search = client.prepareSearch()
VectorHighlighterTests.java
@@ -60,7 +60,7 @@ public class VectorHighlighterTests {
         String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))),
                 reader, topDocs.scoreDocs[0].doc, "content", 30);
         assertThat(fragment, notNullValue());
-        System.out.println(fragment);
+        assertThat(fragment, equalTo("e big <b>bad</b> dog "));
     }

     @Test