Highlighting: Add boundary_chars and boundary_max_scan to control text boundaries with the fast vector highlighter (term vectors), closes #1614.

commit 21405f5aa4
parent 9bdef666cb
Author: Shay Banon
Date:   2012-01-15 23:05:34 +02:00

8 changed files with 120 additions and 13 deletions
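
For context, a sketch of a search request body that exercises the new options. The key names (snake_case or camelCase) and the handling of boundary_chars as a string of characters follow from the parser changes below; the query, field, and values are illustrative only:

{
  "query": { "term": { "title": "test" } },
  "highlight": {
    "boundary_max_scan": 30,
    "boundary_chars": ".,!? \t\n",
    "fields": {
      "title": {}
    }
  }
}

As the title notes, these settings only take effect when a field is highlighted with the fast vector highlighter, i.e. when it is indexed with term vectors (with positions and offsets).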

SimpleBoundaryScanner2.java (new file)

@@ -0,0 +1,62 @@
package org.elasticsearch.common.lucene.search.vectorhighlight;

import gnu.trove.set.hash.TCharHashSet;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;

/**
 * A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner}.
 * <p/>
 * Uses a specialized char set to look up boundaries, and fixes a problem with the start offset
 * at the beginning of the text: https://issues.apache.org/jira/browse/LUCENE-3697 (which causes
 * problems with multiple empty fields to highlight...).
 */
public class SimpleBoundaryScanner2 implements BoundaryScanner {

    public static final int DEFAULT_MAX_SCAN = 20;
    public static final char[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'};

    public static final SimpleBoundaryScanner2 DEFAULT = new SimpleBoundaryScanner2();

    public int maxScan;
    public TCharHashSet boundaryChars;

    public SimpleBoundaryScanner2() {
        this(DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS);
    }

    public SimpleBoundaryScanner2(int maxScan, char[] boundaryChars) {
        this.maxScan = maxScan;
        this.boundaryChars = new TCharHashSet(boundaryChars);
    }

    public int findStartOffset(StringBuilder buffer, int start) {
        // avoid illegal start offset
        if (start > buffer.length() || start < 1) return start;
        int offset, count = maxScan;
        for (offset = start; offset > 0 && count > 0; count--) {
            // found?
            if (boundaryChars.contains(buffer.charAt(offset - 1))) return offset;
            offset--;
        }
        // LUCENE-3697: if the scan reached the start of the text, use offset 0 as the
        // boundary instead of falling back to the original (possibly mid-word) offset
        if (offset == 0) {
            return 0;
        }
        // not found
        return start;
    }

    public int findEndOffset(StringBuilder buffer, int start) {
        // avoid illegal start offset
        if (start > buffer.length() || start < 0) return start;
        int offset, count = maxScan;
        //for( offset = start; offset <= buffer.length() && count > 0; count-- ){
        for (offset = start; offset < buffer.length() && count > 0; count--) {
            // found?
            if (boundaryChars.contains(buffer.charAt(offset))) return offset;
            offset++;
        }
        // not found
        return start;
    }
}
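
A minimal usage sketch (not part of the commit) of the two scan methods, including the LUCENE-3697 behavior where a backward scan that runs into the start of the text returns offset 0 instead of giving up:

SimpleBoundaryScanner2 scanner = new SimpleBoundaryScanner2(20, new char[]{'.'});
StringBuilder text = new StringBuilder("the quick brown fox. jumps over");
// no '.' occurs before offset 12, so the backward scan reaches the start of the
// buffer and returns 0 (the LUCENE-3697 fix); stock Lucene would return 12 here
int start = scanner.findStartOffset(text, 12);
// the forward scan from offset 12 stops at the first boundary char, the '.' at offset 19
int end = scanner.findEndOffset(text, 12);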

HighlightPhase.java

@@ -36,6 +36,7 @@ import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.lucene.document.SingleFieldSelector;
import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
+ import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MapperService;
@@ -237,13 +238,19 @@ public class HighlightPhase implements FetchSubPhase {
if (entry == null) {
FragListBuilder fragListBuilder;
FragmentsBuilder fragmentsBuilder;
+ BoundaryScanner boundaryScanner = SimpleBoundaryScanner2.DEFAULT;
+ if (field.boundaryMaxScan() != SimpleBoundaryScanner2.DEFAULT_MAX_SCAN || field.boundaryChars() != SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS) {
+     boundaryScanner = new SimpleBoundaryScanner2(field.boundaryMaxScan(), field.boundaryChars());
+ }
if (field.numberOfFragments() == 0) {
fragListBuilder = new SingleFragListBuilder();
if (mapper.stored()) {
- fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags());
+ fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
} else {
- fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+ fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
}
} else {
if (field.fragmentOffset() == -1)
@@ -253,15 +260,15 @@ public class HighlightPhase implements FetchSubPhase {
if (field.scoreOrdered()) {
if (mapper.stored()) {
- fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags());
+ fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
} else {
- fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+ fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
}
} else {
if (mapper.stored()) {
- fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags());
+ fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
} else {
- fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags());
+ fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
}
}
}
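
Note that the custom-versus-default check above compares field.boundaryChars() against DEFAULT_BOUNDARY_CHARS by reference, not by content. That works because the parse element (below) assigns the shared default array to any field that does not set boundary_chars, so a different reference can only mean a user-supplied value. A small illustration, with the user-supplied array being hypothetical:

char[] inherited = SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS; // field left unset
char[] supplied = ".,!? \t\n".toCharArray();                      // same content, different array
boolean inheritedIsDefault = inherited == SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS; // true
boolean suppliedIsDefault = supplied == SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS;   // false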

HighlighterParseElement.java

@@ -20,6 +20,7 @@
package org.elasticsearch.search.highlight;
import com.google.common.collect.Lists;
+ import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.SearchParseElement;
import org.elasticsearch.search.SearchParseException;
@@ -73,6 +74,8 @@ public class HighlighterParseElement implements SearchParseElement {
int globalFragmentSize = 100;
int globalNumOfFragments = 5;
String globalEncoder = "default";
+ int globalBoundaryMaxScan = SimpleBoundaryScanner2.DEFAULT_MAX_SCAN;
+ char[] globalBoundaryChars = SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
@@ -110,6 +113,10 @@ public class HighlighterParseElement implements SearchParseElement {
globalEncoder = parser.text();
} else if ("require_field_match".equals(topLevelFieldName) || "requireFieldMatch".equals(topLevelFieldName)) {
globalRequireFieldMatch = parser.booleanValue();
} else if ("boundary_max_scan".equals(topLevelFieldName) || "boundaryMaxScan".equals(topLevelFieldName)) {
globalBoundaryMaxScan = parser.intValue();
} else if ("boundary_chars".equals(topLevelFieldName) || "boundaryChars".equals(topLevelFieldName)) {
globalBoundaryChars = parser.text().toCharArray();
}
} else if (token == XContentParser.Token.START_OBJECT) {
if ("fields".equals(topLevelFieldName)) {
@@ -150,6 +157,10 @@ public class HighlighterParseElement implements SearchParseElement {
field.scoreOrdered("score".equals(parser.text()));
} else if ("require_field_match".equals(fieldName) || "requireFieldMatch".equals(fieldName)) {
field.requireFieldMatch(parser.booleanValue());
} else if ("boundary_max_scan".equals(topLevelFieldName) || "boundaryMaxScan".equals(topLevelFieldName)) {
field.boundaryMaxScan(parser.intValue());
} else if ("boundary_chars".equals(topLevelFieldName) || "boundaryChars".equals(topLevelFieldName)) {
field.boundaryChars(parser.text().toCharArray());
}
}
}
@@ -189,6 +200,12 @@ public class HighlighterParseElement implements SearchParseElement {
if (field.requireFieldMatch() == null) {
field.requireFieldMatch(globalRequireFieldMatch);
}
+ if (field.boundaryMaxScan() == -1) {
+     field.boundaryMaxScan(globalBoundaryMaxScan);
+ }
+ if (field.boundaryChars() == null) {
+     field.boundaryChars(globalBoundaryChars);
+ }
}
context.highlight(new SearchContextHighlight(fields));
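
The fallback above relies on sentinel values: Field (next file) initializes boundaryMaxScan to -1 and boundaryChars to null, which lets the parse element distinguish "not set on this field" from a real value and inherit the top-level setting, itself defaulting to the scanner's constants. A hedged restatement of the effective resolution:

// effective value: field setting if present, else top-level setting, else built-in default
int maxScan = field.boundaryMaxScan() != -1 ? field.boundaryMaxScan() : globalBoundaryMaxScan;
char[] chars = field.boundaryChars() != null ? field.boundaryChars() : globalBoundaryChars;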

SearchContextHighlight.java

@@ -58,6 +58,9 @@ public class SearchContextHighlight {
private Boolean requireFieldMatch;
+ private int boundaryMaxScan = -1;
+ private char[] boundaryChars = null;
public Field(String field) {
this.field = field;
}
@@ -137,5 +140,21 @@
public void requireFieldMatch(boolean requireFieldMatch) {
this.requireFieldMatch = requireFieldMatch;
}
+ public int boundaryMaxScan() {
+     return boundaryMaxScan;
+ }
+ public void boundaryMaxScan(int boundaryMaxScan) {
+     this.boundaryMaxScan = boundaryMaxScan;
+ }
+ public char[] boundaryChars() {
+     return boundaryChars;
+ }
+ public void boundaryChars(char[] boundaryChars) {
+     this.boundaryChars = boundaryChars;
+ }
}
}

SourceScoreOrderFragmentsBuilder.java

@@ -21,6 +21,7 @@ package org.elasticsearch.search.highlight.vectorhighlight;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
+ import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.internal.SearchContext;
@@ -39,8 +40,8 @@ public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder
private final SearchContext searchContext;
public SourceScoreOrderFragmentsBuilder(FieldMapper mapper, SearchContext searchContext,
- String[] preTags, String[] postTags) {
- super(preTags, postTags);
+ String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
+ super(preTags, postTags, boundaryScanner);
this.mapper = mapper;
this.searchContext = searchContext;
}

SourceSimpleFragmentsBuilder.java

@@ -21,6 +21,7 @@ package org.elasticsearch.search.highlight.vectorhighlight;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
+ import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.internal.SearchContext;
@@ -39,8 +40,8 @@ public class SourceSimpleFragmentsBuilder extends SimpleFragmentsBuilder {
private final SearchContext searchContext;
public SourceSimpleFragmentsBuilder(FieldMapper mapper, SearchContext searchContext,
- String[] preTags, String[] postTags) {
- super(preTags, postTags);
+ String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
+ super(preTags, postTags, boundaryScanner);
this.mapper = mapper;
this.searchContext = searchContext;
}
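
Both source-backed builders simply forward the scanner to their Lucene superclass constructors, so the same wiring works with the stock Lucene builders as well; a hedged sketch with illustrative tags and settings:

// org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder, as imported above
BoundaryScanner scanner = new SimpleBoundaryScanner2(10, new char[]{'.', '!', '?'});
SimpleFragmentsBuilder fragmentsBuilder =
        new SimpleFragmentsBuilder(new String[]{"<em>"}, new String[]{"</em>"}, scanner);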

HighlighterSearchTests.java

@@ -555,7 +555,7 @@ public class HighlighterSearchTests extends AbstractNodesTests {
assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
SearchHit hit = search.hits().getAt(0);
- assertThat(hit.highlightFields().get("title").fragments()[0], equalTo(" is a <em>test</em> "));
+ assertThat(hit.highlightFields().get("title").fragments()[0], equalTo("this is a <em>test</em> "));
// search on title.key and highlight on title
search = client.prepareSearch()
@@ -596,7 +596,7 @@ public class HighlighterSearchTests extends AbstractNodesTests {
assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
SearchHit hit = search.hits().getAt(0);
- assertThat(hit.highlightFields().get("title").fragments()[0], equalTo(" is a <em>test</em> "));
+ assertThat(hit.highlightFields().get("title").fragments()[0], equalTo("this is a <em>test</em> "));
// search on title.key and highlight on title.key
search = client.prepareSearch()

VectorHighlighterTests.java

@@ -60,7 +60,7 @@ public class VectorHighlighterTests {
String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))),
reader, topDocs.scoreDocs[0].doc, "content", 30);
assertThat(fragment, notNullValue());
- System.out.println(fragment);
+ assertThat(fragment, equalTo("e big <b>bad</b> dog "));
}
@Test