Fixes highlight issue for multivalues fields described in issue #1994

This commit is contained in:
Martijn van Groningen 2012-06-11 22:58:29 +02:00 committed by Shay Banon
parent 9798d3e8fb
commit 1319ed9322
5 changed files with 331 additions and 5 deletions

View File

@ -0,0 +1,136 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.vectorhighlight;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.Encoder;
import java.io.IOException;
import java.util.*;
/**
* Abstract {@link FragmentsBuilder} implementation that detects whether highlight hits occurred on a field that is
* multivalued (Basically fields that have the same name) and splits the highlight snippets according to a single field
* boundary. This avoids that a highlight hit is shown as one hit whilst it is actually a hit on multiple fields.
*/
public abstract class AbstractFragmentsBuilder extends BaseFragmentsBuilder {
private boolean discreteMultiValueHighlighting = true;
protected AbstractFragmentsBuilder(){
super();
}
protected AbstractFragmentsBuilder(BoundaryScanner boundaryScanner){
super(boundaryScanner);
}
protected AbstractFragmentsBuilder( String[] preTags, String[] postTags ){
super(preTags, postTags);
}
public AbstractFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs) {
super( preTags, postTags, bs );
}
public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
}
public String[] createFragments(IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList, int maxNumFragments,
String[] preTags, String[] postTags, Encoder encoder) throws IOException {
if (maxNumFragments < 0) {
throw new IllegalArgumentException("maxNumFragments(" + maxNumFragments + ") must be positive number.");
}
List<String> fragments = new ArrayList<String>(maxNumFragments);
List<FieldFragList.WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
Field[] values = getFields(reader, docId, fieldName);
if (values.length == 0) {
return null;
}
if (discreteMultiValueHighlighting && values.length > fragInfos.size()) {
Map<Field, List<FieldFragList.WeightedFragInfo>> fieldsWeightedFragInfo = new HashMap<Field, List<FieldFragList.WeightedFragInfo>>();
int startOffset = 0;
int endOffset = 0;
for (Field value : values) {
endOffset += value.stringValue().length();
List<FieldFragList.WeightedFragInfo.SubInfo> fieldToSubInfos = new ArrayList<FieldFragList.WeightedFragInfo.SubInfo>();
List<FieldFragList.WeightedFragInfo> fieldToWeightedFragInfos = new ArrayList<FieldFragList.WeightedFragInfo>();
fieldsWeightedFragInfo.put(value, fieldToWeightedFragInfos);
for (FieldFragList.WeightedFragInfo fragInfo : fragInfos) {
int weightedFragInfoStartOffset = startOffset;
if (fragInfo.getStartOffset() > startOffset && fragInfo.getStartOffset() < endOffset) {
weightedFragInfoStartOffset = fragInfo.getStartOffset();
}
int weightedFragInfoEndOffset = endOffset;
if (fragInfo.getEndOffset() > startOffset && fragInfo.getEndOffset() < endOffset) {
weightedFragInfoEndOffset = fragInfo.getEndOffset();
}
fieldToWeightedFragInfos.add(new WeightedFragInfo(weightedFragInfoStartOffset, weightedFragInfoEndOffset, fragInfo.getTotalBoost(), fieldToSubInfos));
for (FieldFragList.WeightedFragInfo.SubInfo subInfo : fragInfo.getSubInfos()) {
for (FieldPhraseList.WeightedPhraseInfo.Toffs toffs : subInfo.getTermsOffsets()) {
if (toffs.getStartOffset() >= startOffset && toffs.getEndOffset() < endOffset) {
fieldToSubInfos.add(subInfo);
}
}
}
}
startOffset = endOffset + 1;
}
fragInfos.clear();
for (Map.Entry<Field, List<FieldFragList.WeightedFragInfo>> entry : fieldsWeightedFragInfo.entrySet()) {
fragInfos.addAll(entry.getValue());
}
Collections.sort(fragInfos, new Comparator<FieldFragList.WeightedFragInfo>() {
public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) {
return info1.getStartOffset() - info2.getStartOffset();
}
});
fragInfos = getWeightedFragInfoList(fragInfos);
}
StringBuilder buffer = new StringBuilder();
int[] nextValueIndex = {0};
for (int n = 0; n < maxNumFragments && n < fragInfos.size(); n++) {
FieldFragList.WeightedFragInfo fragInfo = fragInfos.get(n);
fragments.add(makeFragment(buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder));
}
return fragments.toArray(new String[fragments.size()]);
}
private static class WeightedFragInfo extends FieldFragList.WeightedFragInfo {
private final static List<FieldPhraseList.WeightedPhraseInfo> EMPTY = Collections.emptyList();
private WeightedFragInfo(int startOffset, int endOffset, float totalBoost, List<FieldFragList.WeightedFragInfo.SubInfo> subInfos) {
super(startOffset, endOffset, EMPTY);
this.subInfos = subInfos;
this.totalBoost = totalBoost;
}
}
}

View File

@ -0,0 +1,78 @@
package org.apache.lucene.search.vectorhighlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
* An implementation of FragmentsBuilder that outputs score-order fragments.
*/
public class ScoreOrderFragmentsBuilder extends AbstractFragmentsBuilder {
/**
* a constructor.
*/
public ScoreOrderFragmentsBuilder(){
super();
}
/**
* a constructor.
*
* @param preTags array of pre-tags for markup terms.
* @param postTags array of post-tags for markup terms.
*/
public ScoreOrderFragmentsBuilder(String[] preTags, String[] postTags){
super( preTags, postTags );
}
public ScoreOrderFragmentsBuilder(BoundaryScanner bs){
super( bs );
}
public ScoreOrderFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs){
super( preTags, postTags, bs );
}
/**
* Sort by score the list of WeightedFragInfo
*/
@Override
public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) {
Collections.sort( src, new ScoreComparator() );
return src;
}
public static class ScoreComparator implements Comparator<WeightedFragInfo> {
public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) {
if( o1.totalBoost > o2.totalBoost ) return -1;
else if( o1.totalBoost < o2.totalBoost ) return 1;
// if same score then check startOffset
else{
if( o1.startOffset < o2.startOffset ) return -1;
else if( o1.startOffset > o2.startOffset ) return 1;
}
return 0;
}
}
}

View File

@ -0,0 +1,62 @@
package org.apache.lucene.search.vectorhighlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import java.util.List;
/**
* A simple implementation of FragmentsBuilder.
*
*/
public class SimpleFragmentsBuilder extends AbstractFragmentsBuilder {
/**
* a constructor.
*/
public SimpleFragmentsBuilder() {
super();
}
/**
* a constructor.
*
* @param preTags array of pre-tags for markup terms.
* @param postTags array of post-tags for markup terms.
*/
public SimpleFragmentsBuilder(String[] preTags, String[] postTags) {
super( preTags, postTags );
}
public SimpleFragmentsBuilder(BoundaryScanner bs) {
super( bs );
}
public SimpleFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs) {
super( preTags, postTags, bs );
}
/**
* do nothing. return the source list.
*/
@Override
public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) {
return src;
}
}

View File

@ -220,9 +220,9 @@ public class HighlightPhase implements FetchSubPhase {
String[] fragments = null;
// number_of_fragments is set to 0 but we have a multivalued field
if (field.numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
fragments = new String[1];
fragments = new String[fragsList.size()];
for (int i = 0; i < fragsList.size(); i++) {
fragments[0] = (fragments[0] != null ? (fragments[0] + " ") : "") + fragsList.get(i).toString();
fragments[i] = fragsList.get(i).toString();
}
} else {
// refine numberOfFragments if needed

View File

@ -122,7 +122,8 @@ public class HighlighterSearchTests extends AbstractNodesTests {
assertThat(search.hits().hits().length, equalTo(5));
for (SearchHit hit : search.hits()) {
assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("<em>attachment</em> 1 <em>attachment</em> 2"));
assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("<em>attachment</em> 1"));
assertThat(hit.highlightFields().get("attachments.body").fragments()[1], equalTo("<em>attachment</em> 2"));
}
}
@ -167,7 +168,7 @@ public class HighlighterSearchTests extends AbstractNodesTests {
search = client.prepareSearch()
.setQuery(fieldQuery("attachments.body", "attachment"))
.addHighlightedField("attachments.body", -1, 0)
.addHighlightedField("attachments.body", -1, 2)
.execute().actionGet();
assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
@ -176,10 +177,59 @@ public class HighlighterSearchTests extends AbstractNodesTests {
assertThat(search.hits().hits().length, equalTo(5));
for (SearchHit hit : search.hits()) {
assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("<em>attachment</em> 1 <em>attachment</em> 2"));
assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("<em>attachment</em> 1"));
assertThat(hit.highlightFields().get("attachments.body").fragments()[1], equalTo("<em>attachment</em> 2"));
}
}
@Test
public void testHighlightIssue1994() throws Exception {
try {
client.admin().indices().prepareDelete("test").execute().actionGet();
} catch (Exception e) {
// ignore
}
client.admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("number_of_shards", 2))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
// we don't store title, now lets see if it works...
.startObject("title").field("type", "string").field("store", "no").endObject()
.startObject("titleTV").field("type", "string").field("store", "no").field("term_vector", "with_positions_offsets").endObject()
.endObject().endObject().endObject())
.execute().actionGet();
client.prepareIndex("test", "type1", "1")
.setSource(XContentFactory.jsonBuilder().startObject()
.startArray("title")
.value("This is a test on the highlighting bug present in elasticsearch")
.value("The bug is bugging us")
.endArray()
.startArray("titleTV")
.value("This is a test on the highlighting bug present in elasticsearch")
.value("The bug is bugging us")
.endArray()
.endObject())
.setRefresh(true).execute().actionGet();
SearchResponse search = client.prepareSearch()
.setQuery(fieldQuery("title", "bug"))
.addHighlightedField("title", -1, 2)
.addHighlightedField("titleTV", -1, 2)
.execute().actionGet();
assertThat(search.hits().totalHits(), equalTo(1l));
assertThat(search.hits().hits().length, equalTo(1));
assertThat(search.hits().hits()[0].highlightFields().get("title").fragments().length, equalTo(2));
assertThat(search.hits().hits()[0].highlightFields().get("title").fragments()[0], equalTo("This is a test on the highlighting <em>bug</em> present in elasticsearch"));
assertThat(search.hits().hits()[0].highlightFields().get("title").fragments()[1], equalTo("The <em>bug</em> is bugging us"));
assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments().length, equalTo(2));
// assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[0], equalTo("This is a test on the highlighting <em>bug</em> present in elasticsearch"));
assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[0], equalTo("highlighting <em>bug</em> present in elasticsearch")); // FastVectorHighlighter starts highlighting from startOffset - margin
assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[1], equalTo("The <em>bug</em> is bugging us"));
}
@Test
public void testPlainHighlighter() throws Exception {
try {