diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/AbstractFragmentsBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/AbstractFragmentsBuilder.java new file mode 100644 index 00000000000..a0a794c850c --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/AbstractFragmentsBuilder.java @@ -0,0 +1,136 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.search.vectorhighlight; + +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.highlight.Encoder; + +import java.io.IOException; +import java.util.*; + +/** + * Abstract {@link FragmentsBuilder} implementation that detects whether highlight hits occurred on a field that is + * multivalued (Basically fields that have the same name) and splits the highlight snippets according to a single field + * boundary. This avoids that a highlight hit is shown as one hit whilst it is actually a hit on multiple fields. + */ +public abstract class AbstractFragmentsBuilder extends BaseFragmentsBuilder { + + private boolean discreteMultiValueHighlighting = true; + + protected AbstractFragmentsBuilder(){ + super(); + } + + protected AbstractFragmentsBuilder(BoundaryScanner boundaryScanner){ + super(boundaryScanner); + } + + protected AbstractFragmentsBuilder( String[] preTags, String[] postTags ){ + super(preTags, postTags); + } + + public AbstractFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs) { + super( preTags, postTags, bs ); + } + + public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) { + this.discreteMultiValueHighlighting = discreteMultiValueHighlighting; + } + + public String[] createFragments(IndexReader reader, int docId, + String fieldName, FieldFragList fieldFragList, int maxNumFragments, + String[] preTags, String[] postTags, Encoder encoder) throws IOException { + if (maxNumFragments < 0) { + throw new IllegalArgumentException("maxNumFragments(" + maxNumFragments + ") must be positive number."); + } + + List fragments = new ArrayList(maxNumFragments); + List fragInfos = fieldFragList.getFragInfos(); + Field[] values = getFields(reader, docId, fieldName); + if (values.length == 0) { + return null; + } + + if (discreteMultiValueHighlighting && values.length > fragInfos.size()) { + Map> fieldsWeightedFragInfo = new HashMap>(); + int startOffset = 0; + int endOffset = 0; + for (Field value : values) { + endOffset += value.stringValue().length(); + List fieldToSubInfos = new ArrayList(); + List fieldToWeightedFragInfos = new ArrayList(); + fieldsWeightedFragInfo.put(value, fieldToWeightedFragInfos); + for (FieldFragList.WeightedFragInfo fragInfo : fragInfos) { + int weightedFragInfoStartOffset = startOffset; + if (fragInfo.getStartOffset() > startOffset && fragInfo.getStartOffset() < endOffset) { + weightedFragInfoStartOffset = fragInfo.getStartOffset(); + } + int weightedFragInfoEndOffset = endOffset; + if (fragInfo.getEndOffset() > startOffset && fragInfo.getEndOffset() < endOffset) { + weightedFragInfoEndOffset = fragInfo.getEndOffset(); + } + + fieldToWeightedFragInfos.add(new WeightedFragInfo(weightedFragInfoStartOffset, weightedFragInfoEndOffset, fragInfo.getTotalBoost(), fieldToSubInfos)); + for (FieldFragList.WeightedFragInfo.SubInfo subInfo : fragInfo.getSubInfos()) { + for (FieldPhraseList.WeightedPhraseInfo.Toffs toffs : subInfo.getTermsOffsets()) { + if (toffs.getStartOffset() >= startOffset && toffs.getEndOffset() < endOffset) { + fieldToSubInfos.add(subInfo); + } + } + } + } + startOffset = endOffset + 1; + } + fragInfos.clear(); + for (Map.Entry> entry : fieldsWeightedFragInfo.entrySet()) { + fragInfos.addAll(entry.getValue()); + } + Collections.sort(fragInfos, new Comparator() { + + public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) { + return info1.getStartOffset() - info2.getStartOffset(); + } + + }); + fragInfos = getWeightedFragInfoList(fragInfos); + } + + StringBuilder buffer = new StringBuilder(); + int[] nextValueIndex = {0}; + for (int n = 0; n < maxNumFragments && n < fragInfos.size(); n++) { + FieldFragList.WeightedFragInfo fragInfo = fragInfos.get(n); + fragments.add(makeFragment(buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder)); + } + return fragments.toArray(new String[fragments.size()]); + } + + private static class WeightedFragInfo extends FieldFragList.WeightedFragInfo { + + private final static List EMPTY = Collections.emptyList(); + + private WeightedFragInfo(int startOffset, int endOffset, float totalBoost, List subInfos) { + super(startOffset, endOffset, EMPTY); + this.subInfos = subInfos; + this.totalBoost = totalBoost; + } + } + +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java new file mode 100644 index 00000000000..5e8f025dd5c --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java @@ -0,0 +1,78 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +/** + * An implementation of FragmentsBuilder that outputs score-order fragments. + */ +public class ScoreOrderFragmentsBuilder extends AbstractFragmentsBuilder { + + /** + * a constructor. + */ + public ScoreOrderFragmentsBuilder(){ + super(); + } + + /** + * a constructor. + * + * @param preTags array of pre-tags for markup terms. + * @param postTags array of post-tags for markup terms. + */ + public ScoreOrderFragmentsBuilder(String[] preTags, String[] postTags){ + super( preTags, postTags ); + } + + public ScoreOrderFragmentsBuilder(BoundaryScanner bs){ + super( bs ); + } + + public ScoreOrderFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs){ + super( preTags, postTags, bs ); + } + + /** + * Sort by score the list of WeightedFragInfo + */ + @Override + public List getWeightedFragInfoList( List src ) { + Collections.sort( src, new ScoreComparator() ); + return src; + } + + public static class ScoreComparator implements Comparator { + + public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) { + if( o1.totalBoost > o2.totalBoost ) return -1; + else if( o1.totalBoost < o2.totalBoost ) return 1; + // if same score then check startOffset + else{ + if( o1.startOffset < o2.startOffset ) return -1; + else if( o1.startOffset > o2.startOffset ) return 1; + } + return 0; + } + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java new file mode 100644 index 00000000000..ff649f9ae67 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java @@ -0,0 +1,62 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; + +import java.util.List; + +/** + * A simple implementation of FragmentsBuilder. + * + */ +public class SimpleFragmentsBuilder extends AbstractFragmentsBuilder { + + /** + * a constructor. + */ + public SimpleFragmentsBuilder() { + super(); + } + + /** + * a constructor. + * + * @param preTags array of pre-tags for markup terms. + * @param postTags array of post-tags for markup terms. + */ + public SimpleFragmentsBuilder(String[] preTags, String[] postTags) { + super( preTags, postTags ); + } + + public SimpleFragmentsBuilder(BoundaryScanner bs) { + super( bs ); + } + + public SimpleFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs) { + super( preTags, postTags, bs ); + } + + /** + * do nothing. return the source list. + */ + @Override + public List getWeightedFragInfoList( List src ) { + return src; + } +} diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java b/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java index 8c671cb3d2a..0549ec0689c 100644 --- a/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java +++ b/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java @@ -220,9 +220,9 @@ public class HighlightPhase implements FetchSubPhase { String[] fragments = null; // number_of_fragments is set to 0 but we have a multivalued field if (field.numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) { - fragments = new String[1]; + fragments = new String[fragsList.size()]; for (int i = 0; i < fragsList.size(); i++) { - fragments[0] = (fragments[0] != null ? (fragments[0] + " ") : "") + fragsList.get(i).toString(); + fragments[i] = fragsList.get(i).toString(); } } else { // refine numberOfFragments if needed diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java index b1fb300077d..68ee978925d 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java @@ -122,7 +122,8 @@ public class HighlighterSearchTests extends AbstractNodesTests { assertThat(search.hits().hits().length, equalTo(5)); for (SearchHit hit : search.hits()) { - assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("attachment 1 attachment 2")); + assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("attachment 1")); + assertThat(hit.highlightFields().get("attachments.body").fragments()[1], equalTo("attachment 2")); } } @@ -167,7 +168,7 @@ public class HighlighterSearchTests extends AbstractNodesTests { search = client.prepareSearch() .setQuery(fieldQuery("attachments.body", "attachment")) - .addHighlightedField("attachments.body", -1, 0) + .addHighlightedField("attachments.body", -1, 2) .execute().actionGet(); assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0)); @@ -176,10 +177,59 @@ public class HighlighterSearchTests extends AbstractNodesTests { assertThat(search.hits().hits().length, equalTo(5)); for (SearchHit hit : search.hits()) { - assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("attachment 1 attachment 2")); + assertThat(hit.highlightFields().get("attachments.body").fragments()[0], equalTo("attachment 1")); + assertThat(hit.highlightFields().get("attachments.body").fragments()[1], equalTo("attachment 2")); } } + @Test + public void testHighlightIssue1994() throws Exception { + try { + client.admin().indices().prepareDelete("test").execute().actionGet(); + } catch (Exception e) { + // ignore + } + + client.admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("number_of_shards", 2)) + .addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties") + // we don't store title, now lets see if it works... + .startObject("title").field("type", "string").field("store", "no").endObject() + .startObject("titleTV").field("type", "string").field("store", "no").field("term_vector", "with_positions_offsets").endObject() + .endObject().endObject().endObject()) + .execute().actionGet(); + + + client.prepareIndex("test", "type1", "1") + .setSource(XContentFactory.jsonBuilder().startObject() + .startArray("title") + .value("This is a test on the highlighting bug present in elasticsearch") + .value("The bug is bugging us") + .endArray() + .startArray("titleTV") + .value("This is a test on the highlighting bug present in elasticsearch") + .value("The bug is bugging us") + .endArray() + .endObject()) + .setRefresh(true).execute().actionGet(); + + SearchResponse search = client.prepareSearch() + .setQuery(fieldQuery("title", "bug")) + .addHighlightedField("title", -1, 2) + .addHighlightedField("titleTV", -1, 2) + .execute().actionGet(); + + assertThat(search.hits().totalHits(), equalTo(1l)); + assertThat(search.hits().hits().length, equalTo(1)); + + assertThat(search.hits().hits()[0].highlightFields().get("title").fragments().length, equalTo(2)); + assertThat(search.hits().hits()[0].highlightFields().get("title").fragments()[0], equalTo("This is a test on the highlighting bug present in elasticsearch")); + assertThat(search.hits().hits()[0].highlightFields().get("title").fragments()[1], equalTo("The bug is bugging us")); + assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments().length, equalTo(2)); +// assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[0], equalTo("This is a test on the highlighting bug present in elasticsearch")); + assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[0], equalTo("highlighting bug present in elasticsearch")); // FastVectorHighlighter starts highlighting from startOffset - margin + assertThat(search.hits().hits()[0].highlightFields().get("titleTV").fragments()[1], equalTo("The bug is bugging us")); + } + @Test public void testPlainHighlighter() throws Exception { try {