diff --git a/src/main/java/org/elasticsearch/common/lucene/search/vectorhighlight/SimpleBoundaryScanner2.java b/src/main/java/org/elasticsearch/common/lucene/search/vectorhighlight/SimpleBoundaryScanner2.java new file mode 100644 index 00000000000..7a92ae7e327 --- /dev/null +++ b/src/main/java/org/elasticsearch/common/lucene/search/vectorhighlight/SimpleBoundaryScanner2.java @@ -0,0 +1,85 @@ +package org.elasticsearch.common.lucene.search.vectorhighlight; + +import gnu.trove.set.hash.TCharHashSet; +import org.apache.lucene.search.vectorhighlight.BoundaryScanner; + +/** + * A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner}. + *
+ * Uses specialized char set to lookup boundary, and fixes a problem with start offset in the + * beginning of the text: https://issues.apache.org/jira/browse/LUCENE-3697 (which has a problem + * with multiple empty fields to highlight...). + */ +public class SimpleBoundaryScanner2 implements BoundaryScanner { + + /** Default maximum number of characters examined by a single scan. */ + public static final int DEFAULT_MAX_SCAN = 20; + /** Default set of characters treated as boundaries. */ + public static final char[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'}; + + /** Shared instance built from the defaults above; its public fields are mutable, so callers must not modify them. */ + public static final SimpleBoundaryScanner2 DEFAULT = new SimpleBoundaryScanner2(); + + /** Maximum number of characters examined by a single scan. */ + public int maxScan; + /** Characters that count as a boundary. */ + public TCharHashSet boundaryChars; + + /** Creates a scanner using {@link #DEFAULT_MAX_SCAN} and {@link #DEFAULT_BOUNDARY_CHARS}. */ + public SimpleBoundaryScanner2() { + this(DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS); + } + + /** + * @param maxScan maximum number of characters examined by a single scan + * @param boundaryChars characters that count as a boundary; used to populate {@link #boundaryChars} + */ + public SimpleBoundaryScanner2(int maxScan, char[] boundaryChars) { + this.maxScan = maxScan; + this.boundaryChars = new TCharHashSet(boundaryChars); + } + + /** + * Scans backwards from {@code start}, over at most {@link #maxScan} characters, for a boundary character. + * + * @return the offset just after the nearest preceding boundary character; {@code 0} if the scan reached the + * beginning of the buffer without finding one (LUCENE-3697); otherwise {@code start} unchanged + */ + @Override + public int findStartOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if (start > buffer.length() || start < 1) return start; + int offset, count = maxScan; + for (offset = start; offset > 0 && count > 0; count--) { + // found? + if (boundaryChars.contains(buffer.charAt(offset - 1))) return offset; + offset--; + } + // LUCENE-3697 + if (offset == 0) { + return 0; + } + // not found + return start; + } + + /** + * Scans forwards from {@code start}, over at most {@link #maxScan} characters, for a boundary character. + * + * @return the offset of the first boundary character found, or {@code start} unchanged if none is found + * or {@code start} is out of range + */ + @Override + public int findEndOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if (start > buffer.length() || start < 0) return start; + int offset, count = maxScan; + for (offset = start; offset < buffer.length() && count > 0; count--) { + // found? 
+ if (boundaryChars.contains(buffer.charAt(offset))) return offset; + offset++; + } + // not found + return start; + } +} diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java b/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java index a7006f79da4..6ddaa384067 100644 --- a/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java +++ b/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java @@ -36,6 +36,7 @@ import org.elasticsearch.common.io.FastStringReader; import org.elasticsearch.common.lucene.document.SingleFieldSelector; import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery; import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; +import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.MapperService; @@ -237,13 +238,19 @@ public class HighlightPhase implements FetchSubPhase { if (entry == null) { FragListBuilder fragListBuilder; FragmentsBuilder fragmentsBuilder; + + BoundaryScanner boundaryScanner = SimpleBoundaryScanner2.DEFAULT; + if (field.boundaryMaxScan() != SimpleBoundaryScanner2.DEFAULT_MAX_SCAN || field.boundaryChars() != SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS) { + boundaryScanner = new SimpleBoundaryScanner2(field.boundaryMaxScan(), field.boundaryChars()); + } + if (field.numberOfFragments() == 0) { fragListBuilder = new SingleFragListBuilder(); if (mapper.stored()) { - fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags()); + fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner); } else { - fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags()); + fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), 
field.postTags(), boundaryScanner); } } else { if (field.fragmentOffset() == -1) @@ -253,15 +260,15 @@ public class HighlightPhase implements FetchSubPhase { if (field.scoreOrdered()) { if (mapper.stored()) { - fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags()); + fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner); } else { - fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags()); + fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner); } } else { if (mapper.stored()) { - fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags()); + fragmentsBuilder = new SimpleFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner); } else { - fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags()); + fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner); } } } diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java b/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java index d204ec7b4a7..cbc104e8bd0 100644 --- a/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java +++ b/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java @@ -20,6 +20,7 @@ package org.elasticsearch.search.highlight; import com.google.common.collect.Lists; +import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.search.SearchParseElement; import org.elasticsearch.search.SearchParseException; @@ -73,6 +74,8 @@ public class HighlighterParseElement implements SearchParseElement { int globalFragmentSize = 100; int globalNumOfFragments = 5; String 
globalEncoder = "default"; + int globalBoundaryMaxScan = SimpleBoundaryScanner2.DEFAULT_MAX_SCAN; + char[] globalBoundaryChars = SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS; while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { if (token == XContentParser.Token.FIELD_NAME) { @@ -110,6 +113,10 @@ public class HighlighterParseElement implements SearchParseElement { globalEncoder = parser.text(); } else if ("require_field_match".equals(topLevelFieldName) || "requireFieldMatch".equals(topLevelFieldName)) { globalRequireFieldMatch = parser.booleanValue(); + } else if ("boundary_max_scan".equals(topLevelFieldName) || "boundaryMaxScan".equals(topLevelFieldName)) { + globalBoundaryMaxScan = parser.intValue(); + } else if ("boundary_chars".equals(topLevelFieldName) || "boundaryChars".equals(topLevelFieldName)) { + globalBoundaryChars = parser.text().toCharArray(); } } else if (token == XContentParser.Token.START_OBJECT) { if ("fields".equals(topLevelFieldName)) { @@ -150,6 +157,10 @@ public class HighlighterParseElement implements SearchParseElement { field.scoreOrdered("score".equals(parser.text())); } else if ("require_field_match".equals(fieldName) || "requireFieldMatch".equals(fieldName)) { field.requireFieldMatch(parser.booleanValue()); + } else if ("boundary_max_scan".equals(fieldName) || "boundaryMaxScan".equals(fieldName)) { + field.boundaryMaxScan(parser.intValue()); + } else if ("boundary_chars".equals(fieldName) || "boundaryChars".equals(fieldName)) { + field.boundaryChars(parser.text().toCharArray()); } } } @@ -189,6 +200,13 @@ public class HighlighterParseElement implements SearchParseElement { if (field.requireFieldMatch() == null) { field.requireFieldMatch(globalRequireFieldMatch); } + /* -1 / null are the Field's "not set" defaults: fall back to the global (or built-in default) value */ + if (field.boundaryMaxScan() == -1) { + field.boundaryMaxScan(globalBoundaryMaxScan); + } + if (field.boundaryChars() == null) { + field.boundaryChars(globalBoundaryChars); + } } context.highlight(new 
SearchContextHighlight(fields)); diff --git a/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java b/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java index 228781b4023..10d2878bf3f 100644 --- a/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java +++ b/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java @@ -58,6 +58,9 @@ public class SearchContextHighlight { private Boolean requireFieldMatch; + private int boundaryMaxScan = -1; + private char[] boundaryChars = null; + public Field(String field) { this.field = field; } @@ -137,5 +140,21 @@ public class SearchContextHighlight { public void requireFieldMatch(boolean requireFieldMatch) { this.requireFieldMatch = requireFieldMatch; } + + public int boundaryMaxScan() { + return boundaryMaxScan; + } + + public void boundaryMaxScan(int boundaryMaxScan) { + this.boundaryMaxScan = boundaryMaxScan; + } + + public char[] boundaryChars() { + return boundaryChars; + } + + public void boundaryChars(char[] boundaryChars) { + this.boundaryChars = boundaryChars; + } } } diff --git a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java index a13e95704c5..0443616ca05 100644 --- a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java +++ b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java @@ -21,6 +21,7 @@ package org.elasticsearch.search.highlight.vectorhighlight; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.vectorhighlight.BoundaryScanner; import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder; import org.elasticsearch.index.mapper.FieldMapper; import 
org.elasticsearch.search.internal.SearchContext; @@ -39,8 +40,8 @@ public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder private final SearchContext searchContext; public SourceScoreOrderFragmentsBuilder(FieldMapper mapper, SearchContext searchContext, - String[] preTags, String[] postTags) { - super(preTags, postTags); + String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) { + super(preTags, postTags, boundaryScanner); this.mapper = mapper; this.searchContext = searchContext; } diff --git a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java index 48bcc18f715..2cb35f01546 100644 --- a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java +++ b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceSimpleFragmentsBuilder.java @@ -21,6 +21,7 @@ package org.elasticsearch.search.highlight.vectorhighlight; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.vectorhighlight.BoundaryScanner; import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.search.internal.SearchContext; @@ -39,8 +40,8 @@ public class SourceSimpleFragmentsBuilder extends SimpleFragmentsBuilder { private final SearchContext searchContext; public SourceSimpleFragmentsBuilder(FieldMapper mapper, SearchContext searchContext, - String[] preTags, String[] postTags) { - super(preTags, postTags); + String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) { + super(preTags, postTags, boundaryScanner); this.mapper = mapper; this.searchContext = searchContext; } diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java 
b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java index 6a3d5f9bab6..21811ecee0e 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java @@ -555,7 +555,7 @@ public class HighlighterSearchTests extends AbstractNodesTests { assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0)); SearchHit hit = search.hits().getAt(0); - assertThat(hit.highlightFields().get("title").fragments()[0], equalTo(" is a test ")); + assertThat(hit.highlightFields().get("title").fragments()[0], equalTo("this is a test ")); // search on title.key and highlight on title search = client.prepareSearch() @@ -596,7 +596,7 @@ public class HighlighterSearchTests extends AbstractNodesTests { assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0)); SearchHit hit = search.hits().getAt(0); - assertThat(hit.highlightFields().get("title").fragments()[0], equalTo(" is a test ")); + assertThat(hit.highlightFields().get("title").fragments()[0], equalTo("this is a test ")); // search on title.key and highlight on title.key search = client.prepareSearch() diff --git a/src/test/java/org/elasticsearch/test/unit/deps/lucene/VectorHighlighterTests.java b/src/test/java/org/elasticsearch/test/unit/deps/lucene/VectorHighlighterTests.java index 08039790693..fde13951bf8 100644 --- a/src/test/java/org/elasticsearch/test/unit/deps/lucene/VectorHighlighterTests.java +++ b/src/test/java/org/elasticsearch/test/unit/deps/lucene/VectorHighlighterTests.java @@ -60,7 +60,7 @@ public class VectorHighlighterTests { String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))), reader, topDocs.scoreDocs[0].doc, "content", 30); assertThat(fragment, notNullValue()); - System.out.println(fragment); + assertThat(fragment, 
equalTo("e big bad dog ")); } @Test