Fixes the highlighting issue for multivalued fields described in issue #1994
This commit is contained in:
parent
9798d3e8fb
commit
1319ed9322
|
@ -0,0 +1,136 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.vectorhighlight;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.highlight.Encoder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Abstract {@link FragmentsBuilder} implementation that detects whether highlight hits occurred on a field that is
|
||||
* multivalued (Basically fields that have the same name) and splits the highlight snippets according to a single field
|
||||
* boundary. This avoids that a highlight hit is shown as one hit whilst it is actually a hit on multiple fields.
|
||||
*/
|
||||
public abstract class AbstractFragmentsBuilder extends BaseFragmentsBuilder {
|
||||
|
||||
private boolean discreteMultiValueHighlighting = true;
|
||||
|
||||
protected AbstractFragmentsBuilder(){
|
||||
super();
|
||||
}
|
||||
|
||||
protected AbstractFragmentsBuilder(BoundaryScanner boundaryScanner){
|
||||
super(boundaryScanner);
|
||||
}
|
||||
|
||||
protected AbstractFragmentsBuilder( String[] preTags, String[] postTags ){
|
||||
super(preTags, postTags);
|
||||
}
|
||||
|
||||
public AbstractFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs) {
|
||||
super( preTags, postTags, bs );
|
||||
}
|
||||
|
||||
public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
|
||||
this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
|
||||
}
|
||||
|
||||
public String[] createFragments(IndexReader reader, int docId,
|
||||
String fieldName, FieldFragList fieldFragList, int maxNumFragments,
|
||||
String[] preTags, String[] postTags, Encoder encoder) throws IOException {
|
||||
if (maxNumFragments < 0) {
|
||||
throw new IllegalArgumentException("maxNumFragments(" + maxNumFragments + ") must be positive number.");
|
||||
}
|
||||
|
||||
List<String> fragments = new ArrayList<String>(maxNumFragments);
|
||||
List<FieldFragList.WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
|
||||
Field[] values = getFields(reader, docId, fieldName);
|
||||
if (values.length == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (discreteMultiValueHighlighting && values.length > fragInfos.size()) {
|
||||
Map<Field, List<FieldFragList.WeightedFragInfo>> fieldsWeightedFragInfo = new HashMap<Field, List<FieldFragList.WeightedFragInfo>>();
|
||||
int startOffset = 0;
|
||||
int endOffset = 0;
|
||||
for (Field value : values) {
|
||||
endOffset += value.stringValue().length();
|
||||
List<FieldFragList.WeightedFragInfo.SubInfo> fieldToSubInfos = new ArrayList<FieldFragList.WeightedFragInfo.SubInfo>();
|
||||
List<FieldFragList.WeightedFragInfo> fieldToWeightedFragInfos = new ArrayList<FieldFragList.WeightedFragInfo>();
|
||||
fieldsWeightedFragInfo.put(value, fieldToWeightedFragInfos);
|
||||
for (FieldFragList.WeightedFragInfo fragInfo : fragInfos) {
|
||||
int weightedFragInfoStartOffset = startOffset;
|
||||
if (fragInfo.getStartOffset() > startOffset && fragInfo.getStartOffset() < endOffset) {
|
||||
weightedFragInfoStartOffset = fragInfo.getStartOffset();
|
||||
}
|
||||
int weightedFragInfoEndOffset = endOffset;
|
||||
if (fragInfo.getEndOffset() > startOffset && fragInfo.getEndOffset() < endOffset) {
|
||||
weightedFragInfoEndOffset = fragInfo.getEndOffset();
|
||||
}
|
||||
|
||||
fieldToWeightedFragInfos.add(new WeightedFragInfo(weightedFragInfoStartOffset, weightedFragInfoEndOffset, fragInfo.getTotalBoost(), fieldToSubInfos));
|
||||
for (FieldFragList.WeightedFragInfo.SubInfo subInfo : fragInfo.getSubInfos()) {
|
||||
for (FieldPhraseList.WeightedPhraseInfo.Toffs toffs : subInfo.getTermsOffsets()) {
|
||||
if (toffs.getStartOffset() >= startOffset && toffs.getEndOffset() < endOffset) {
|
||||
fieldToSubInfos.add(subInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
startOffset = endOffset + 1;
|
||||
}
|
||||
fragInfos.clear();
|
||||
for (Map.Entry<Field, List<FieldFragList.WeightedFragInfo>> entry : fieldsWeightedFragInfo.entrySet()) {
|
||||
fragInfos.addAll(entry.getValue());
|
||||
}
|
||||
Collections.sort(fragInfos, new Comparator<FieldFragList.WeightedFragInfo>() {
|
||||
|
||||
public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) {
|
||||
return info1.getStartOffset() - info2.getStartOffset();
|
||||
}
|
||||
|
||||
});
|
||||
fragInfos = getWeightedFragInfoList(fragInfos);
|
||||
}
|
||||
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
int[] nextValueIndex = {0};
|
||||
for (int n = 0; n < maxNumFragments && n < fragInfos.size(); n++) {
|
||||
FieldFragList.WeightedFragInfo fragInfo = fragInfos.get(n);
|
||||
fragments.add(makeFragment(buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder));
|
||||
}
|
||||
return fragments.toArray(new String[fragments.size()]);
|
||||
}
|
||||
|
||||
private static class WeightedFragInfo extends FieldFragList.WeightedFragInfo {
|
||||
|
||||
private final static List<FieldPhraseList.WeightedPhraseInfo> EMPTY = Collections.emptyList();
|
||||
|
||||
private WeightedFragInfo(int startOffset, int endOffset, float totalBoost, List<FieldFragList.WeightedFragInfo.SubInfo> subInfos) {
|
||||
super(startOffset, endOffset, EMPTY);
|
||||
this.subInfos = subInfos;
|
||||
this.totalBoost = totalBoost;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,78 @@
|
|||
package org.apache.lucene.search.vectorhighlight;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* An implementation of FragmentsBuilder that outputs score-order fragments.
|
||||
*/
|
||||
public class ScoreOrderFragmentsBuilder extends AbstractFragmentsBuilder {
|
||||
|
||||
/**
|
||||
* a constructor.
|
||||
*/
|
||||
public ScoreOrderFragmentsBuilder(){
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* a constructor.
|
||||
*
|
||||
* @param preTags array of pre-tags for markup terms.
|
||||
* @param postTags array of post-tags for markup terms.
|
||||
*/
|
||||
public ScoreOrderFragmentsBuilder(String[] preTags, String[] postTags){
|
||||
super( preTags, postTags );
|
||||
}
|
||||
|
||||
public ScoreOrderFragmentsBuilder(BoundaryScanner bs){
|
||||
super( bs );
|
||||
}
|
||||
|
||||
public ScoreOrderFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs){
|
||||
super( preTags, postTags, bs );
|
||||
}
|
||||
|
||||
/**
|
||||
* Sort by score the list of WeightedFragInfo
|
||||
*/
|
||||
@Override
|
||||
public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) {
|
||||
Collections.sort( src, new ScoreComparator() );
|
||||
return src;
|
||||
}
|
||||
|
||||
public static class ScoreComparator implements Comparator<WeightedFragInfo> {
|
||||
|
||||
public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) {
|
||||
if( o1.totalBoost > o2.totalBoost ) return -1;
|
||||
else if( o1.totalBoost < o2.totalBoost ) return 1;
|
||||
// if same score then check startOffset
|
||||
else{
|
||||
if( o1.startOffset < o2.startOffset ) return -1;
|
||||
else if( o1.startOffset > o2.startOffset ) return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
package org.apache.lucene.search.vectorhighlight;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A simple implementation of FragmentsBuilder.
|
||||
*
|
||||
*/
|
||||
public class SimpleFragmentsBuilder extends AbstractFragmentsBuilder {
|
||||
|
||||
/**
|
||||
* a constructor.
|
||||
*/
|
||||
public SimpleFragmentsBuilder() {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* a constructor.
|
||||
*
|
||||
* @param preTags array of pre-tags for markup terms.
|
||||
* @param postTags array of post-tags for markup terms.
|
||||
*/
|
||||
public SimpleFragmentsBuilder(String[] preTags, String[] postTags) {
|
||||
super( preTags, postTags );
|
||||
}
|
||||
|
||||
public SimpleFragmentsBuilder(BoundaryScanner bs) {
|
||||
super( bs );
|
||||
}
|
||||
|
||||
public SimpleFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs) {
|
||||
super( preTags, postTags, bs );
|
||||
}
|
||||
|
||||
/**
|
||||
* do nothing. return the source list.
|
||||
*/
|
||||
@Override
|
||||
public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) {
|
||||
return src;
|
||||
}
|
||||
}
|
|
@ -220,9 +220,9 @@ public class HighlightPhase implements FetchSubPhase {
|
|||
String[] fragments = null;
|
||||
// number_of_fragments is set to 0 but we have a multivalued field
|
||||
if (field.numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
|
||||
fragments = new String[1];
|
||||
fragments = new String[fragsList.size()];
|
||||
for (int i = 0; i < fragsList.size(); i++) {
|
||||
fragments[0] = (fragments[0] != null ? (fragments[0] + " ") : "") + fragsList.get(i).toString();
|
||||
fragments[i] = fragsList.get(i).toString();
|
||||
}
|
||||
} else {
|
||||
// refine numberOfFragments if needed
|
||||
|
|
|
@ -122,7 +122,8 @@ public class HighlighterSearchTests extends AbstractNodesTests {
|
|||
assertThat(search.hits().hits().length, equalTo(5));
|
||||
|
||||
for (SearchHit hit : search.hits()) {
|
||||
assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("<em>attachment</em> 1 <em>attachment</em> 2"));
|
||||
assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("<em>attachment</em> 1"));
|
||||
assertThat(hit.highlightFields().get("attachments.body").fragments()[1], equalTo("<em>attachment</em> 2"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -167,7 +168,7 @@ public class HighlighterSearchTests extends AbstractNodesTests {
|
|||
|
||||
search = client.prepareSearch()
|
||||
.setQuery(fieldQuery("attachments.body", "attachment"))
|
||||
.addHighlightedField("attachments.body", -1, 0)
|
||||
.addHighlightedField("attachments.body", -1, 2)
|
||||
.execute().actionGet();
|
||||
|
||||
assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
|
||||
|
@ -176,10 +177,59 @@ public class HighlighterSearchTests extends AbstractNodesTests {
|
|||
assertThat(search.hits().hits().length, equalTo(5));
|
||||
|
||||
for (SearchHit hit : search.hits()) {
|
||||
assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("<em>attachment</em> 1 <em>attachment</em> 2"));
|
||||
assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("<em>attachment</em> 1"));
|
||||
assertThat(hit.highlightFields().get("attachments.body").fragments()[1], equalTo("<em>attachment</em> 2"));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHighlightIssue1994() throws Exception {
|
||||
try {
|
||||
client.admin().indices().prepareDelete("test").execute().actionGet();
|
||||
} catch (Exception e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
client.admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("number_of_shards", 2))
|
||||
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
|
||||
// we don't store title, now lets see if it works...
|
||||
.startObject("title").field("type", "string").field("store", "no").endObject()
|
||||
.startObject("titleTV").field("type", "string").field("store", "no").field("term_vector", "with_positions_offsets").endObject()
|
||||
.endObject().endObject().endObject())
|
||||
.execute().actionGet();
|
||||
|
||||
|
||||
client.prepareIndex("test", "type1", "1")
|
||||
.setSource(XContentFactory.jsonBuilder().startObject()
|
||||
.startArray("title")
|
||||
.value("This is a test on the highlighting bug present in elasticsearch")
|
||||
.value("The bug is bugging us")
|
||||
.endArray()
|
||||
.startArray("titleTV")
|
||||
.value("This is a test on the highlighting bug present in elasticsearch")
|
||||
.value("The bug is bugging us")
|
||||
.endArray()
|
||||
.endObject())
|
||||
.setRefresh(true).execute().actionGet();
|
||||
|
||||
SearchResponse search = client.prepareSearch()
|
||||
.setQuery(fieldQuery("title", "bug"))
|
||||
.addHighlightedField("title", -1, 2)
|
||||
.addHighlightedField("titleTV", -1, 2)
|
||||
.execute().actionGet();
|
||||
|
||||
assertThat(search.hits().totalHits(), equalTo(1l));
|
||||
assertThat(search.hits().hits().length, equalTo(1));
|
||||
|
||||
assertThat(search.hits().hits()[0].highlightFields().get("title").fragments().length, equalTo(2));
|
||||
assertThat(search.hits().hits()[0].highlightFields().get("title").fragments()[0], equalTo("This is a test on the highlighting <em>bug</em> present in elasticsearch"));
|
||||
assertThat(search.hits().hits()[0].highlightFields().get("title").fragments()[1], equalTo("The <em>bug</em> is bugging us"));
|
||||
assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments().length, equalTo(2));
|
||||
// assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[0], equalTo("This is a test on the highlighting <em>bug</em> present in elasticsearch"));
|
||||
assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[0], equalTo("highlighting <em>bug</em> present in elasticsearch")); // FastVectorHighlighter starts highlighting from startOffset - margin
|
||||
assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[1], equalTo("The <em>bug</em> is bugging us"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPlainHighlighter() throws Exception {
|
||||
try {
|
||||
|
|
Loading…
Reference in New Issue