diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4dae8657e8d..bfeeb3ec572 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -117,6 +117,9 @@ New Features compute each suggestion's weight using a javascript expression. (Areek Zillur via Mike McCandless) +* LUCENE-5274: FastVectorHighlighter now supports highlighting against several + indexed fields. (Nik Everett via Adrien Grand) + Bug Fixes * LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead diff --git a/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java b/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java index dffa1e49e81..17f1460a75e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java @@ -25,6 +25,7 @@ import java.util.Map; import org.apache.lucene.search.Query; import org.apache.lucene.index.BufferedDeletesStream.QueryAndLimit; +import org.apache.lucene.util.MergedIterator; class CoalescedDeletes { final Map queries = new HashMap(); diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java index 86956ddcf7d..b25d6556610 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java @@ -28,6 +28,7 @@ import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.MergedIterator; /** * Exposes flex API, merged from flex API of sub-segments. 
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergedIterator.java b/lucene/core/src/java/org/apache/lucene/util/MergedIterator.java similarity index 71% rename from lucene/core/src/java/org/apache/lucene/index/MergedIterator.java rename to lucene/core/src/java/org/apache/lucene/util/MergedIterator.java index 9150930780f..9a2770c0ba5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergedIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/MergedIterator.java @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -23,32 +23,42 @@ import java.util.NoSuchElementException; import org.apache.lucene.util.PriorityQueue; /** - * Provides a merged sorted view from several sorted iterators, each - * iterating over a unique set of elements. + * Provides a merged sorted view from several sorted iterators. *

- * If an element appears in multiple iterators, it is deduplicated, - * that is this iterator returns the sorted union of elements. + * If built with removeDuplicates set to true and an element + * appears in multiple iterators then it is deduplicated, that is this iterator + * returns the sorted union of elements. + *

+ * If built with removeDuplicates set to false then all elements + * in all iterators are returned. *

* Caveats: *

* @lucene.internal */ -final class MergedIterator> implements Iterator { +public final class MergedIterator> implements Iterator { private T current; private final TermMergeQueue queue; private final SubIterator[] top; + private final boolean removeDuplicates; private int numTop; - + @SuppressWarnings({"unchecked","rawtypes"}) public MergedIterator(Iterator... iterators) { + this(true, iterators); + } + + @SuppressWarnings({"unchecked","rawtypes"}) + public MergedIterator(boolean removeDuplicates, Iterator... iterators) { + this.removeDuplicates = removeDuplicates; queue = new TermMergeQueue(iterators.length); top = new SubIterator[iterators.length]; int index = 0; @@ -100,13 +110,13 @@ final class MergedIterator> implements Iterator { } private void pullTop() { - // extract all subs from the queue that have the same top element assert numTop == 0; - while (true) { - top[numTop++] = queue.pop(); - if (queue.size() == 0 - || !(queue.top()).current.equals(top[0].current)) { - break; + top[numTop++] = queue.pop(); + if (removeDuplicates) { + // extract all subs from the queue that have the same top element + while (queue.size() != 0 + && queue.top().current.equals(top[0].current)) { + top[numTop++] = queue.pop(); } } current = top[0].current; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java b/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java index 4dcbd8d4724..3f55f07b9dc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java @@ -25,6 +25,7 @@ import java.util.Set; import java.util.TreeSet; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.MergedIterator; import org.apache.lucene.util._TestUtil; public class TestPrefixCodedTerms extends LuceneTestCase { @@ -66,15 +67,6 @@ public class TestPrefixCodedTerms extends LuceneTestCase { } assertFalse(expected.hasNext()); } - - 
@SuppressWarnings("unchecked") - public void testMergeEmpty() { - Iterator merged = new MergedIterator(); - assertFalse(merged.hasNext()); - - merged = new MergedIterator(new PrefixCodedTerms.Builder().finish().iterator(), new PrefixCodedTerms.Builder().finish().iterator()); - assertFalse(merged.hasNext()); - } @SuppressWarnings("unchecked") public void testMergeOne() { diff --git a/lucene/core/src/test/org/apache/lucene/util/TestMergedIterator.java b/lucene/core/src/test/org/apache/lucene/util/TestMergedIterator.java new file mode 100644 index 00000000000..e9e37a732fc --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestMergedIterator.java @@ -0,0 +1,154 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Random; + +import com.carrotsearch.randomizedtesting.annotations.Repeat; + +public class TestMergedIterator extends LuceneTestCase { + private static final int REPEATS = 2; + private static final int VALS_TO_MERGE = 15000; + + @SuppressWarnings({"rawtypes", "unchecked"}) + public void testMergeEmpty() { + Iterator merged = new MergedIterator(); + assertFalse(merged.hasNext()); + + merged = new MergedIterator(new ArrayList().iterator()); + assertFalse(merged.hasNext()); + + Iterator[] itrs = new Iterator[random().nextInt(100)]; + for (int i = 0; i < itrs.length; i++) { + itrs[i] = new ArrayList().iterator(); + } + merged = new MergedIterator( itrs ); + assertFalse(merged.hasNext()); + } + + @Repeat(iterations = REPEATS) + public void testNoDupsRemoveDups() { + testCase(1, 1, true); + } + + @Repeat(iterations = REPEATS) + public void testOffItrDupsRemoveDups() { + testCase(3, 1, true); + } + + @Repeat(iterations = REPEATS) + public void testOnItrDupsRemoveDups() { + testCase(1, 3, true); + } + + @Repeat(iterations = REPEATS) + public void testOnItrRandomDupsRemoveDups() { + testCase(1, -3, true); + } + + @Repeat(iterations = REPEATS) + public void testBothDupsRemoveDups() { + testCase(3, 3, true); + } + + @Repeat(iterations = REPEATS) + public void testBothDupsWithRandomDupsRemoveDups() { + testCase(3, -3, true); + } + + @Repeat(iterations = REPEATS) + public void testNoDupsKeepDups() { + testCase(1, 1, false); + } + + @Repeat(iterations = REPEATS) + public void testOffItrDupsKeepDups() { + testCase(3, 1, false); + } + + @Repeat(iterations = REPEATS) + public void testOnItrDupsKeepDups() { + testCase(1, 3, false); + } + + @Repeat(iterations = REPEATS) + public void testOnItrRandomDupsKeepDups() { + testCase(1, -3, false); + } + + @Repeat(iterations = REPEATS) + public void testBothDupsKeepDups() { + testCase(3, 3, false); + } + + @Repeat(iterations = REPEATS) + 
public void testBothDupsWithRandomDupsKeepDups() { + testCase(3, -3, false); + } + + private void testCase(int itrsWithVal, int specifiedValsOnItr, boolean removeDups) { + // Build a random number of lists + List expected = new ArrayList(); + Random random = new Random(random().nextLong()); + int numLists = itrsWithVal + random.nextInt(1000 - itrsWithVal); + @SuppressWarnings({"rawtypes", "unchecked"}) + List[] lists = new List[numLists]; + for (int i = 0; i < numLists; i++) { + lists[i] = new ArrayList(); + } + int start = random.nextInt(1000000); + int end = start + VALS_TO_MERGE / itrsWithVal / Math.abs(specifiedValsOnItr); + for (int i = start; i < end; i++) { + int maxList = lists.length; + int maxValsOnItr = 0; + int sumValsOnItr = 0; + for (int itrWithVal = 0; itrWithVal < itrsWithVal; itrWithVal++) { + int list = random.nextInt(maxList); + int valsOnItr = specifiedValsOnItr < 0 ? (1 + random.nextInt(-specifiedValsOnItr)) : specifiedValsOnItr; + maxValsOnItr = Math.max(maxValsOnItr, valsOnItr); + sumValsOnItr += valsOnItr; + for (int valOnItr = 0; valOnItr < valsOnItr; valOnItr++) { + lists[list].add(i); + } + maxList = maxList - 1; + ArrayUtil.swap(lists, list, maxList); + } + int maxCount = removeDups ? 
maxValsOnItr : sumValsOnItr; + for (int count = 0; count < maxCount; count++) { + expected.add(i); + } + } + // Now check that they get merged cleanly + @SuppressWarnings({"rawtypes", "unchecked"}) + Iterator[] itrs = new Iterator[numLists]; + for (int i = 0; i < numLists; i++) { + itrs[i] = lists[i].iterator(); + } + + MergedIterator mergedItr = new MergedIterator(removeDups, itrs); + Iterator expectedItr = expected.iterator(); + while (expectedItr.hasNext()) { + assertTrue(mergedItr.hasNext()); + assertEquals(expectedItr.next(), mergedItr.next()); + } + assertFalse(mergedItr.hasNext()); + } +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java index 80fd58fd9dd..79240371133 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java @@ -18,6 +18,8 @@ package org.apache.lucene.search.vectorhighlight; */ import java.io.IOException; +import java.util.Iterator; +import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; @@ -28,7 +30,6 @@ import org.apache.lucene.search.highlight.Encoder; * */ public class FastVectorHighlighter { - public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true; public static final boolean DEFAULT_FIELD_MATCH = true; private final boolean phraseHighlight; @@ -186,15 +187,70 @@ public class FastVectorHighlighter { return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments, preTags, postTags, encoder ); } - + + /** + * Return the best fragments. Matches are scanned from matchedFields and turned into fragments against + * storedField. 
The highlighting may not make sense if matchedFields has matches with offsets that don't + * correspond to features in storedField. It will outright throw a {@code StringIndexOutOfBoundsException} + * if matchedFields produces offsets outside of storedField. As such it is advisable that all + * matchedFields share the same source as storedField or are at least a prefix of it. + * + * @param fieldQuery {@link FieldQuery} object + * @param reader {@link IndexReader} of the index + * @param docId document id to be highlighted + * @param storedField field of the document that stores the text + * @param matchedFields fields of the document to scan for matches + * @param fragCharSize the length (number of chars) of a fragment + * @param maxNumFragments maximum number of fragments + * @param fragListBuilder {@link FragListBuilder} object + * @param fragmentsBuilder {@link FragmentsBuilder} object + * @param preTags pre-tags to be used to highlight terms + * @param postTags post-tags to be used to highlight terms + * @param encoder an encoder that generates encoded text + * @return created fragments or null when no fragments created. + * size of the array can be less than maxNumFragments + * @throws IOException If there is a low-level I/O error + */ + public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId, + String storedField, Set< String > matchedFields, int fragCharSize, int maxNumFragments, + FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder, + String[] preTags, String[] postTags, Encoder encoder ) throws IOException { + FieldFragList fieldFragList = + getFieldFragList( fragListBuilder, fieldQuery, reader, docId, matchedFields, fragCharSize ); + return fragmentsBuilder.createFragments( reader, docId, storedField, fieldFragList, maxNumFragments, + preTags, postTags, encoder ); + } + + /** + * Build a FieldFragList for one field. 
+ */ private FieldFragList getFieldFragList( FragListBuilder fragListBuilder, final FieldQuery fieldQuery, IndexReader reader, int docId, - String fieldName, int fragCharSize ) throws IOException { - FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery ); + String matchedField, int fragCharSize ) throws IOException { + FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, matchedField, fieldQuery ); FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery, phraseLimit ); return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize ); } + /** + * Build a FieldFragList for more than one field by merging one FieldPhraseList per matched field; throws IllegalArgumentException if matchedFields is empty. + */ + private FieldFragList getFieldFragList( FragListBuilder fragListBuilder, + final FieldQuery fieldQuery, IndexReader reader, int docId, + Set< String > matchedFields, int fragCharSize ) throws IOException { + Iterator< String > matchedFieldsItr = matchedFields.iterator(); + if ( !matchedFieldsItr.hasNext() ) { + throw new IllegalArgumentException( "matchedFields must contain at least one field name." ); + } + FieldPhraseList[] toMerge = new FieldPhraseList[ matchedFields.size() ]; + int i = 0; + while ( matchedFieldsItr.hasNext() ) { + FieldTermStack stack = new FieldTermStack( reader, docId, matchedFieldsItr.next(), fieldQuery ); + toMerge[ i++ ] = new FieldPhraseList( stack, fieldQuery, phraseLimit ); + } + return fragListBuilder.createFieldFragList( new FieldPhraseList( toMerge ), fragCharSize ); + } + /** * return whether phraseHighlight or not. 
* diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java index 74cceb471a3..d46b4d2ffcb 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java @@ -17,18 +17,23 @@ package org.apache.lucene.search.vectorhighlight; */ import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; +import org.apache.lucene.util.MergedIterator; /** * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder * to create a FieldFragList object. */ public class FieldPhraseList { - + /** + * List of non-overlapping WeightedPhraseInfo objects. + */ LinkedList phraseList = new LinkedList(); /** @@ -106,6 +111,55 @@ public class FieldPhraseList { } } + /** + * Merging constructor. + * + * @param toMerge FieldPhraseLists to merge to build this one + */ + public FieldPhraseList( FieldPhraseList[] toMerge ) { + // Merge all overlapping WeightedPhraseInfos + // Step 1. Sort by startOffset, endOffset, and boost, in that order. + @SuppressWarnings( { "rawtypes", "unchecked" } ) + Iterator< WeightedPhraseInfo >[] allInfos = new Iterator[ toMerge.length ]; + int index = 0; + for ( FieldPhraseList fplToMerge : toMerge ) { + allInfos[ index++ ] = fplToMerge.phraseList.iterator(); + } + MergedIterator< WeightedPhraseInfo > itr = new MergedIterator< WeightedPhraseInfo >( false, allInfos ); + // Step 2. 
Walk the sorted list merging infos that overlap + phraseList = new LinkedList< WeightedPhraseInfo >(); + if ( !itr.hasNext() ) { + return; + } + List< WeightedPhraseInfo > work = new ArrayList< WeightedPhraseInfo >(); + WeightedPhraseInfo first = itr.next(); + work.add( first ); + int workEndOffset = first.getEndOffset(); + while ( itr.hasNext() ) { + WeightedPhraseInfo current = itr.next(); + if ( current.getStartOffset() <= workEndOffset ) { + workEndOffset = Math.max( workEndOffset, current.getEndOffset() ); + work.add( current ); + } else { + if ( work.size() == 1 ) { + phraseList.add( work.get( 0 ) ); + work.set( 0, current ); + } else { + phraseList.add( new WeightedPhraseInfo( work ) ); + work.clear(); + work.add( current ); + } + workEndOffset = current.getEndOffset(); + } + } + if ( work.size() == 1 ) { + phraseList.add( work.get( 0 ) ); + } else { + phraseList.add( new WeightedPhraseInfo( work ) ); + work.clear(); + } + } + public void addIfNoOverlap( WeightedPhraseInfo wpi ){ for( WeightedPhraseInfo existWpi : getPhraseList() ){ if( existWpi.isOffsetOverlap( wpi ) ) { @@ -121,9 +175,7 @@ public class FieldPhraseList { /** * Represents the list of term offsets and boost for some text */ - public static class WeightedPhraseInfo { - - private String text; // unnecessary member, just exists for debugging purpose + public static class WeightedPhraseInfo implements Comparable< WeightedPhraseInfo > { private List termsOffsets; // usually termsOffsets.size() == 1, // but if position-gap > 1 and slop > 0 then size() could be greater than 1 private float boost; // query boost @@ -132,10 +184,15 @@ public class FieldPhraseList { private ArrayList termsInfos; /** + * Text of the match, calculated on the fly. Use for debugging only. 
* @return the text */ public String getText() { - return text; + StringBuilder text = new StringBuilder(); + for ( TermInfo ti: termsInfos ) { + text.append( ti.getText() ); + } + return text.toString(); } /** @@ -174,15 +231,11 @@ public class FieldPhraseList { TermInfo ti = terms.get( 0 ); termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); if( terms.size() == 1 ){ - text = ti.getText(); return; } - StringBuilder sb = new StringBuilder(); - sb.append( ti.getText() ); int pos = ti.getPosition(); for( int i = 1; i < terms.size(); i++ ){ ti = terms.get( i ); - sb.append( ti.getText() ); if( ti.getPosition() - pos == 1 ){ Toffs to = termsOffsets.get( termsOffsets.size() - 1 ); to.setEndOffset( ti.getEndOffset() ); @@ -192,7 +245,50 @@ public class FieldPhraseList { } pos = ti.getPosition(); } - text = sb.toString(); + } + + /** + * Merging constructor. Boosts are summed and termsInfos concatenated over every info (including the first); seqnum is taken from the first info only. + */ + public WeightedPhraseInfo( Collection< WeightedPhraseInfo > toMerge ) { + // Pretty much the same idea as merging FieldPhraseLists: + // Step 1. Sort by startOffset, endOffset + // While we are here merge the boosts and termInfos + Iterator< WeightedPhraseInfo > toMergeItr = toMerge.iterator(); + if ( !toMergeItr.hasNext() ) { + throw new IllegalArgumentException( "toMerge must contain at least one WeightedPhraseInfo." ); + } + WeightedPhraseInfo first = toMergeItr.next(); + @SuppressWarnings( { "rawtypes", "unchecked" } ) + Iterator< Toffs >[] allToffs = new Iterator[ toMerge.size() ]; + termsInfos = new ArrayList< TermInfo >( first.termsInfos ); + seqnum = first.seqnum; + boost = first.boost; + allToffs[ 0 ] = first.termsOffsets.iterator(); + int index = 1; + while ( toMergeItr.hasNext() ) { + WeightedPhraseInfo info = toMergeItr.next(); + boost += info.boost; + termsInfos.addAll( info.termsInfos ); + allToffs[ index++ ] = info.termsOffsets.iterator(); + } + // Step 2. 
Walk the sorted list merging overlaps + MergedIterator< Toffs > itr = new MergedIterator< Toffs >( false, allToffs ); + termsOffsets = new ArrayList< Toffs >(); + if ( !itr.hasNext() ) { + return; + } + Toffs work = itr.next(); + while ( itr.hasNext() ) { + Toffs current = itr.next(); + if ( current.startOffset <= work.endOffset ) { + work.endOffset = Math.max( work.endOffset, current.endOffset ); + } else { + termsOffsets.add( work ); + work = current; + } + } + termsOffsets.add( work ); } public int getStartOffset(){ @@ -202,7 +298,7 @@ public class FieldPhraseList { public int getEndOffset(){ return termsOffsets.get( termsOffsets.size() - 1 ).endOffset; } - + public boolean isOffsetOverlap( WeightedPhraseInfo other ){ int so = getStartOffset(); int eo = getEndOffset(); @@ -218,7 +314,7 @@ public class FieldPhraseList { @Override public String toString(){ StringBuilder sb = new StringBuilder(); - sb.append( text ).append( '(' ).append( boost ).append( ")(" ); + sb.append( getText() ).append( '(' ).append( boost ).append( ")(" ); for( Toffs to : termsOffsets ){ sb.append( to ); } @@ -233,10 +329,58 @@ public class FieldPhraseList { return seqnum; } + @Override + public int compareTo( WeightedPhraseInfo other ) { + int diff = getStartOffset() - other.getStartOffset(); + if ( diff != 0 ) { + return diff; + } + diff = getEndOffset() - other.getEndOffset(); + if ( diff != 0 ) { + return diff; + } + return (int) Math.signum( getBoost() - other.getBoost() ); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + getStartOffset(); + result = prime * result + getEndOffset(); + long b = Double.doubleToLongBits( getBoost() ); + result = prime * result + ( int )( b ^ ( b >>> 32 ) ); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + WeightedPhraseInfo other 
= (WeightedPhraseInfo) obj; + if (getStartOffset() != other.getStartOffset()) { + return false; + } + if (getEndOffset() != other.getEndOffset()) { + return false; + } + if (getBoost() != other.getBoost()) { + return false; + } + return true; + } + /** * Term offsets (start + end) */ - public static class Toffs { + public static class Toffs implements Comparable< Toffs > { private int startOffset; private int endOffset; public Toffs( int startOffset, int endOffset ){ @@ -253,6 +397,42 @@ public class FieldPhraseList { return endOffset; } @Override + public int compareTo( Toffs other ) { + int diff = getStartOffset() - other.getStartOffset(); + if ( diff != 0 ) { + return diff; + } + return getEndOffset() - other.getEndOffset(); + } + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + getStartOffset(); + result = prime * result + getEndOffset(); + return result; + } + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + Toffs other = (Toffs) obj; + if (getStartOffset() != other.getStartOffset()) { + return false; + } + if (getEndOffset() != other.getEndOffset()) { + return false; + } + return true; + } + @Override public String toString(){ StringBuilder sb = new StringBuilder(); sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java index 1375cfdac47..7c4534e0363 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java @@ -161,7 +161,8 @@ public class FieldTermStack { } /** - * Single term with its position/offsets in 
the document and IDF weight + * Single term with its position/offsets in the document and IDF weight. + * It is Comparable but considers only position. */ public static class TermInfo implements Comparable{ @@ -198,5 +199,30 @@ public class FieldTermStack { public int compareTo( TermInfo o ){ return ( this.position - o.position ); } + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + position; + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + TermInfo other = (TermInfo) obj; + if (position != other.position) { + return false; + } + return true; + } } } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java index 718c7eda46c..40ab5e91635 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java @@ -16,10 +16,18 @@ package org.apache.lucene.search.vectorhighlight; * limitations under the License. 
*/ import java.io.IOException; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AnalyzerWrapper; +import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; @@ -33,10 +41,15 @@ import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.highlight.DefaultEncoder; +import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.RegExp; public class FastVectorHighlighterTest extends LuceneTestCase { @@ -287,4 +300,222 @@ public class FastVectorHighlighterTest extends LuceneTestCase { writer.close(); dir.close(); } + + public void testMatchedFields() throws IOException { + // Searching just on the stored field doesn't highlight a stopword + matchedFieldsTestCase( false, true, "a match", "a match", + clause( "field", "a" ), clause( "field", "match" ) ); + + // Even if you add an unqueried matched field that would match it + matchedFieldsTestCase( "a match", "a match", + clause( "field", "a" ), clause( "field", "match" ) ); + + // Nor if you query the field but don't add it as a matched field to the highlighter + matchedFieldsTestCase( false, false, "a match", "a match", + 
clause( "field_exact", "a" ), clause( "field", "match" ) ); + + // But if you query the field and add it as a matched field to the highlighter then it is highlighted + matchedFieldsTestCase( "a match", "a match", + clause( "field_exact", "a" ), clause( "field", "match" ) ); + + // It is also ok to match just the matched field but get highlighting from the stored field + matchedFieldsTestCase( "a match", "a match", + clause( "field_exact", "a" ), clause( "field_exact", "match" ) ); + + // Boosted matched fields work too + matchedFieldsTestCase( "a match", "a match", + clause( "field_exact", 5, "a" ), clause( "field", "match" ) ); + + // It is also ok if both the stored and the matched field match the term + matchedFieldsTestCase( "a match", "a match", + clause( "field_exact", "match" ), clause( "field", "match" ) ); + + // And the highlighter respects the boosts on matched fields when sorting fragments + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "junk junk a cat junk junk", + clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ) ); + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", + clause( "field", "cat" ), clause( "field_exact", "a", "cat" ) ); + + // The same thing works across three fields as well + matchedFieldsTestCase( "cat cat CAT junk junk junk junk junk junk junk a cat junk junk", + "junk junk a cat junk junk", + clause( "field", "cat" ), clause( "field_exact", 200, "a", "cat" ), clause( "field_super_exact", 5, "CAT" ) ); + matchedFieldsTestCase( "a cat cat junk junk junk junk junk junk junk a CAT junk junk", + "junk junk a CAT junk junk", + clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ), clause( "field_super_exact", 200, "a", "CAT" ) ); + + // And across fields with different tokenizers! 
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "junk junk a cat junk junk", + clause( "field_exact", 5, "a", "cat" ), clause( "field_characters", "c" ) ); + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", + clause( "field_exact", "a", "cat" ), clause( "field_characters", "c" ) ); + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", + clause( "field_exact", "a", "cat" ), clause( "field_characters", "t" ) ); + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", // See how the phrases are joined? + clause( "field", "cat" ), clause( "field_characters", 5, "c" ) ); + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "junk junk a cat junk junk", + clause( "field", "cat" ), clause( "field_characters", 5, "a", " ", "c", "a", "t" ) ); + + // Phrases and tokens inside one another are joined + matchedFieldsTestCase( "cats wow", "cats wow", + clause( "field", "cats" ), clause( "field_tripples", "s w" ) ); + + // Everything works pretty well even if you don't require a field match + matchedFieldsTestCase( true, false, "cat cat junk junk junk junk junk junk junk a cat junk junk", + "junk junk a cat junk junk", + clause( "field", "cat" ), clause( "field_characters", 10, "a", " ", "c", "a", "t" ) ); + + // Even boosts keep themselves pretty much intact + matchedFieldsTestCase( true, false, "a cat cat junk junk junk junk junk junk junk a CAT junk junk", + "junk junk a CAT junk junk", + clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ), clause( "field_super_exact", 200, "a", "CAT" ) ); + matchedFieldsTestCase( true, false, "cat cat CAT junk junk junk junk junk junk junk a cat junk junk", + "junk junk a cat junk junk", + clause( "field", "cat" ), clause( "field_exact", 200, "a", "cat" ), 
clause( "field_super_exact", 5, "CAT" ) ); + + // Except that all the matched field matches apply even if they aren't mentioned in the query + // which can make for some confusing scoring. This isn't too big a deal, just something you + // need to think about when you don't force a field match. + matchedFieldsTestCase( true, false, "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", + clause( "field", "cat" ), clause( "field_characters", 4, "a", " ", "c", "a", "t" ) ); + + // It is also cool to match fields that don't have _exactly_ the same text so long as you are careful. + // In this case field_sliced is a prefix of field. + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", clause( "field_sliced", "cat" ) ); + + // Multiple matches add to the score of the segment + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", + clause( "field", "cat" ), clause( "field_sliced", "cat" ), clause( "field_exact", 2, "a", "cat" ) ); + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "junk junk a cat junk junk", + clause( "field", "cat" ), clause( "field_sliced", "cat" ), clause( "field_exact", 4, "a", "cat" ) ); + + // Even fields with tokens on top of one another are ok + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", + clause( "field_der_red", 2, "der" ), clause( "field_exact", "a", "cat" ) ); + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", + clause( "field_der_red", 2, "red" ), clause( "field_exact", "a", "cat" ) ); + matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk", + "cat cat junk junk junk junk", + clause( "field_der_red", "red" ), clause( "field_der_red", "der" ), clause( 
"field_exact", "a", "cat" ) ); + } + + private void matchedFieldsTestCase( String fieldValue, String expected, Query... queryClauses ) throws IOException { + matchedFieldsTestCase( true, true, fieldValue, expected, queryClauses ); + } + + private void matchedFieldsTestCase( boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses ) throws IOException { + Document doc = new Document(); + FieldType stored = new FieldType( TextField.TYPE_STORED ); + stored.setStoreTermVectorOffsets( true ); + stored.setStoreTermVectorPositions( true ); + stored.setStoreTermVectors( true ); + stored.freeze(); + FieldType matched = new FieldType( TextField.TYPE_NOT_STORED ); + matched.setStoreTermVectorOffsets( true ); + matched.setStoreTermVectorPositions( true ); + matched.setStoreTermVectors( true ); + matched.freeze(); + doc.add( new Field( "field", fieldValue, stored ) ); // Whitespace tokenized with English stop words + doc.add( new Field( "field_exact", fieldValue, matched ) ); // Whitespace tokenized without stop words + doc.add( new Field( "field_super_exact", fieldValue, matched ) ); // Whitespace tokenized without toLower + doc.add( new Field( "field_characters", fieldValue, matched ) ); // Each letter is a token + doc.add( new Field( "field_tripples", fieldValue, matched ) ); // Every three letters is a token + doc.add( new Field( "field_sliced", fieldValue.substring( 0, // Sliced at 10 chars then analyzed just like field + Math.min( fieldValue.length() - 1 , 10 ) ), matched ) ); + doc.add( new Field( "field_der_red", new CannedTokenStream( // Hacky field containing "der" and "red" at pos = 0 + token( "der", 1, 0, 3 ), + token( "red", 0, 0, 3 ) + ), matched ) ); + + final Map fieldAnalyzers = new TreeMap(); + fieldAnalyzers.put( "field", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET ) ); + fieldAnalyzers.put( "field_exact", new MockAnalyzer( random() ) ); + fieldAnalyzers.put( 
"field_super_exact", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, false ) ); + fieldAnalyzers.put( "field_characters", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp(".").toAutomaton() ), true ) ); + fieldAnalyzers.put( "field_tripples", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp("...").toAutomaton() ), true ) ); + fieldAnalyzers.put( "field_sliced", fieldAnalyzers.get( "field" ) ); + fieldAnalyzers.put( "field_der_red", fieldAnalyzers.get( "field" ) ); // This is required even though we provide a token stream + Analyzer analyzer = new AnalyzerWrapper() { + public Analyzer getWrappedAnalyzer(String fieldName) { + return fieldAnalyzers.get( fieldName ); + } + }; + + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer ) ); + writer.addDocument( doc ); + + FastVectorHighlighter highlighter = new FastVectorHighlighter(); + FragListBuilder fragListBuilder = new SimpleFragListBuilder(); + FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(); + IndexReader reader = DirectoryReader.open( writer, true ); + String[] preTags = new String[] { "" }; + String[] postTags = new String[] { "" }; + Encoder encoder = new DefaultEncoder(); + int docId = 0; + BooleanQuery query = new BooleanQuery(); + for ( Query clause : queryClauses ) { + query.add( clause, Occur.MUST ); + } + FieldQuery fieldQuery = new FieldQuery( query, reader, true, fieldMatch ); + String[] bestFragments; + if ( useMatchedFields ) { + Set< String > matchedFields = new HashSet< String >(); + matchedFields.add( "field" ); + matchedFields.add( "field_exact" ); + matchedFields.add( "field_super_exact" ); + matchedFields.add( "field_characters" ); + matchedFields.add( "field_tripples" ); + matchedFields.add( "field_sliced" ); + matchedFields.add( "field_der_red" ); + bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", matchedFields, 25, 1, + 
fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); + } else { + bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 25, 1, + fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); + } + assertEquals( expected, bestFragments[ 0 ] ); + + reader.close(); + writer.close(); + dir.close(); + } + + private Query clause( String field, String... terms ) { + return clause( field, 1, terms ); + } + + private Query clause( String field, float boost, String... terms ) { + Query q; + if ( terms.length == 1 ) { + q = new TermQuery( new Term( field, terms[ 0 ] ) ); + } else { + PhraseQuery pq = new PhraseQuery(); + for ( String term: terms ) { + pq.add( new Term( field, term ) ); + } + q = pq; + } + q.setBoost( boost ); + return q; + } + + private static Token token( String term, int posInc, int startOffset, int endOffset ) { + Token t = new Token( term, startOffset, endOffset ); + t.setPositionIncrement( posInc ); + return t; + } } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java index 45054e6d4e1..afab821da31 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java @@ -16,8 +16,14 @@ package org.apache.lucene.search.vectorhighlight; * limitations under the License. 
*/ +import java.util.LinkedList; + import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs; +import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; +import org.apache.lucene.util._TestUtil; public class FieldPhraseListTest extends AbstractTestCase { @@ -188,7 +194,7 @@ public class FieldPhraseListTest extends AbstractTestCase { assertEquals( 1, fpl.phraseList.size() ); assertEquals( "sppeeeed(1.0)((88,93))", fpl.phraseList.get( 0 ).toString() ); } - + /* This test shows a big speedup from limiting the number of analyzed phrases in * this bad case for FieldPhraseList */ /* But it is not reliable as a unit test since it is timing-dependent @@ -218,4 +224,68 @@ public class FieldPhraseListTest extends AbstractTestCase { assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() ); } */ + + public void testWeightedPhraseInfoComparisonConsistency() { + WeightedPhraseInfo a = newInfo( 0, 0, 1 ); + WeightedPhraseInfo b = newInfo( 1, 2, 1 ); + WeightedPhraseInfo c = newInfo( 2, 3, 1 ); + WeightedPhraseInfo d = newInfo( 0, 0, 1 ); + WeightedPhraseInfo e = newInfo( 0, 0, 2 ); + + assertConsistentEquals( a, a ); + assertConsistentEquals( b, b ); + assertConsistentEquals( c, c ); + assertConsistentEquals( d, d ); + assertConsistentEquals( e, e ); + assertConsistentEquals( a, d ); + assertConsistentLessThan( a, b ); + assertConsistentLessThan( b, c ); + assertConsistentLessThan( a, c ); + assertConsistentLessThan( a, e ); + assertConsistentLessThan( e, b ); + assertConsistentLessThan( e, c ); + assertConsistentLessThan( d, b ); + assertConsistentLessThan( d, c ); + assertConsistentLessThan( d, e ); + } + + public void testToffsComparisonConsistency() { + Toffs a = new Toffs( 0, 0 ); + Toffs b = new Toffs( 1, 2 ); + Toffs c = new Toffs( 2, 
3 ); + Toffs d = new Toffs( 0, 0 ); + + assertConsistentEquals( a, a ); + assertConsistentEquals( b, b ); + assertConsistentEquals( c, c ); + assertConsistentEquals( d, d ); + assertConsistentEquals( a, d ); + assertConsistentLessThan( a, b ); + assertConsistentLessThan( b, c ); + assertConsistentLessThan( a, c ); + assertConsistentLessThan( d, b ); + assertConsistentLessThan( d, c ); + } + + private WeightedPhraseInfo newInfo( int startOffset, int endOffset, float boost ) { + LinkedList< TermInfo > infos = new LinkedList< TermInfo >(); + infos.add( new TermInfo( _TestUtil.randomUnicodeString( random() ), startOffset, endOffset, 0, 0 ) ); + return new WeightedPhraseInfo( infos, boost ); + } + + private < T extends Comparable< T > > void assertConsistentEquals( T a, T b ) { + assertEquals( a, b ); + assertEquals( b, a ); + assertEquals( a.hashCode(), b.hashCode() ); + assertEquals( 0, a.compareTo( b ) ); + assertEquals( 0, b.compareTo( a ) ); + } + + private < T extends Comparable< T > > void assertConsistentLessThan( T a, T b ) { + assertFalse( a.equals( b ) ); + assertFalse( b.equals( a ) ); + assertFalse( a.hashCode() == b.hashCode() ); + assertTrue( a.compareTo( b ) < 0 ); + assertTrue( b.compareTo( a ) > 0 ); + } } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java index 205940859c1..2a4c6739e59 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java @@ -20,6 +20,8 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; +import org.apache.lucene.util._TestUtil; public 
class FieldTermStackTest extends AbstractTestCase { @@ -173,4 +175,37 @@ public class FieldTermStackTest extends AbstractTestCase { assertEquals ("the(195,198,31)", stack.pop().toString()); } + public void testTermInfoComparisonConsistency() { + TermInfo a = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 0, 1 ); + TermInfo b = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 1, 1 ); + TermInfo c = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 2, 1 ); + TermInfo d = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 0, 1 ); + + assertConsistentEquals( a, a ); + assertConsistentEquals( b, b ); + assertConsistentEquals( c, c ); + assertConsistentEquals( d, d ); + assertConsistentEquals( a, d ); + assertConsistentLessThan( a, b ); + assertConsistentLessThan( b, c ); + assertConsistentLessThan( a, c ); + assertConsistentLessThan( d, b ); + assertConsistentLessThan( d, c ); + } + + private < T extends Comparable< T > > void assertConsistentEquals( T a, T b ) { + assertEquals( a, b ); + assertEquals( b, a ); + assertEquals( a.hashCode(), b.hashCode() ); + assertEquals( 0, a.compareTo( b ) ); + assertEquals( 0, b.compareTo( a ) ); + } + + private < T extends Comparable< T > > void assertConsistentLessThan( T a, T b ) { + assertFalse( a.equals( b ) ); + assertFalse( b.equals( a ) ); + assertFalse( a.hashCode() == b.hashCode() ); + assertTrue( a.compareTo( b ) < 0 ); + assertTrue( b.compareTo( a ) > 0 ); + } }