diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java index 5fccc6a8422..679bff0cd21 100644 --- a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java @@ -40,7 +40,7 @@ import java.util.List; * */ // LUCENE MONITOR -public class CustomFieldQuery extends FieldQuery { +public class CustomFieldQuery extends XFieldQuery { private static Field multiTermQueryWrapperFilterQueryField; @@ -55,7 +55,7 @@ public class CustomFieldQuery extends FieldQuery { public static final ThreadLocal highlightFilters = new ThreadLocal(); - public CustomFieldQuery(Query query, IndexReader reader, FastVectorHighlighter highlighter) throws IOException { + public CustomFieldQuery(Query query, IndexReader reader, XFastVectorHighlighter highlighter) throws IOException { this(query, reader, highlighter.isPhraseHighlight(), highlighter.isFieldMatch()); } diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XBaseFragListBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XBaseFragListBuilder.java new file mode 100644 index 00000000000..2e12f8db2c9 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XBaseFragListBuilder.java @@ -0,0 +1,144 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * An abstract implementation of {@link XFragListBuilder}. + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public abstract class XBaseFragListBuilder implements XFragListBuilder { + + public static final int MARGIN_DEFAULT = 6; + public static final int MIN_FRAG_CHAR_SIZE_FACTOR = 3; + + final int margin; + final int minFragCharSize; + + public XBaseFragListBuilder( int margin ){ + if( margin < 0 ) + throw new IllegalArgumentException( "margin(" + margin + ") is too small. It must be 0 or higher." ); + + this.margin = margin; + this.minFragCharSize = Math.max( 1, margin * MIN_FRAG_CHAR_SIZE_FACTOR ); + } + + public XBaseFragListBuilder(){ + this( MARGIN_DEFAULT ); + } + + protected XFieldFragList createFieldFragList( XFieldPhraseList fieldPhraseList, XFieldFragList fieldFragList, int fragCharSize ){ + if( fragCharSize < minFragCharSize ) + throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." 
); + + List wpil = new ArrayList(); + IteratorQueue queue = new IteratorQueue(fieldPhraseList.getPhraseList().iterator()); + WeightedPhraseInfo phraseInfo = null; + int startOffset = 0; + while((phraseInfo = queue.top()) != null){ + // if the phrase violates the border of previous fragment, discard it and try next phrase + if( phraseInfo.getStartOffset() < startOffset ) { + queue.removeTop(); + continue; + } + + wpil.clear(); + final int currentPhraseStartOffset = phraseInfo.getStartOffset(); + int currentPhraseEndOffset = phraseInfo.getEndOffset(); + int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset); + int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize); + if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) { + wpil.add(phraseInfo); + } + while((phraseInfo = queue.top()) != null) { // pull until we crossed the current spanEnd + if (phraseInfo.getEndOffset() <= spanEnd) { + currentPhraseEndOffset = phraseInfo.getEndOffset(); + if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) { + wpil.add(phraseInfo); + } + } else { + break; + } + } + if (wpil.isEmpty()) { + continue; + } + + final int matchLen = currentPhraseEndOffset - currentPhraseStartOffset; + // now recalculate the start and end position to "center" the result + final int newMargin = Math.max(0, (fragCharSize-matchLen)/2); // matchLen can be > fragCharSize prevent IAOOB here + spanStart = currentPhraseStartOffset - newMargin; + if (spanStart < startOffset) { + spanStart = startOffset; + } + // whatever is bigger here we grow this out + spanEnd = spanStart + Math.max(matchLen, fragCharSize); + startOffset = spanEnd; + fieldFragList.add(spanStart, spanEnd, wpil); + } + return fieldFragList; + } + + /** + * A predicate to decide if the given {@link WeightedPhraseInfo} should be + * accepted as a highlighted phrase or if it should be discarded. + *

+ * The default implementation discards phrases that are composed of more than one term + * and where the matchLength exceeds the fragment character size. + * + * @param info the phrase info to accept + * @param matchLength the match length of the current phrase + * @param fragCharSize the configured fragment character size + * @return true if this phrase info should be accepted as a highlighted phrase + */ + protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) { + return info.getTermsOffsets().size() <= 1 || matchLength <= fragCharSize; + } + + private static final class IteratorQueue { + private final Iterator iter; + private T top; + + public IteratorQueue(Iterator iter) { + this.iter = iter; + T removeTop = removeTop(); + assert removeTop == null; + } + + public T top() { + return top; + } + + public T removeTop() { + T currentTop = top; + if (iter.hasNext()) { + top = iter.next(); + } else { + top = null; + } + return currentTop; + } + + } + +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XBaseFragmentsBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XBaseFragmentsBuilder.java new file mode 100644 index 00000000000..87a393c8f3f --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XBaseFragmentsBuilder.java @@ -0,0 +1,332 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.search.highlight.DefaultEncoder; +import org.apache.lucene.search.highlight.Encoder; +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo.SubInfo; +import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo.Toffs; +import org.apache.lucene.util.CollectionUtil; + +import java.io.IOException; +import java.util.*; + +/** + * Base FragmentsBuilder implementation that supports colored pre/post + * tags and multivalued fields. + *

+ * Uses {@link XBoundaryScanner} to determine fragments. + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public abstract class XBaseFragmentsBuilder implements XFragmentsBuilder { + + protected String[] preTags, postTags; + public static final String[] COLORED_PRE_TAGS = { + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "" + }; + public static final String[] COLORED_POST_TAGS = { "" }; + private char multiValuedSeparator = ' '; + private final BoundaryScanner boundaryScanner; + private boolean discreteMultiValueHighlighting = false; + + protected XBaseFragmentsBuilder(){ + this( new String[]{ "" }, new String[]{ "" } ); + } + + protected XBaseFragmentsBuilder( String[] preTags, String[] postTags ){ + this(preTags, postTags, new SimpleBoundaryScanner()); + } + + protected XBaseFragmentsBuilder(BoundaryScanner boundaryScanner){ + this( new String[]{ "" }, new String[]{ "" }, boundaryScanner ); + } + + protected XBaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){ + this.preTags = preTags; + this.postTags = postTags; + this.boundaryScanner = boundaryScanner; + } + + static Object checkTagsArgument( Object tags ){ + if( tags instanceof String ) return tags; + else if( tags instanceof String[] ) return tags; + throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" ); + } + + public abstract List getWeightedFragInfoList( List src ); + + private static final Encoder NULL_ENCODER = new DefaultEncoder(); + + @Override + public String createFragment( IndexReader reader, int docId, + String fieldName, XFieldFragList fieldFragList ) throws IOException { + return createFragment( reader, docId, fieldName, fieldFragList, + preTags, postTags, NULL_ENCODER ); + } + + @Override + public String[] createFragments( IndexReader reader, int docId, + String fieldName, XFieldFragList fieldFragList, int maxNumFragments ) + throws IOException { + return 
createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments, + preTags, postTags, NULL_ENCODER ); + } + + @Override + public String createFragment( IndexReader reader, int docId, + String fieldName, XFieldFragList fieldFragList, String[] preTags, String[] postTags, + Encoder encoder ) throws IOException { + String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1, + preTags, postTags, encoder ); + if( fragments == null || fragments.length == 0 ) return null; + return fragments[0]; + } + + @Override + public String[] createFragments( IndexReader reader, int docId, + String fieldName, XFieldFragList fieldFragList, int maxNumFragments, + String[] preTags, String[] postTags, Encoder encoder ) throws IOException { + + if( maxNumFragments < 0 ) { + throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." ); + } + + List fragInfos = fieldFragList.getFragInfos(); + Field[] values = getFields( reader, docId, fieldName ); + if( values.length == 0 ) { + return null; + } + + if (discreteMultiValueHighlighting && values.length > 1) { + fragInfos = discreteMultiValueHighlighting(fragInfos, values); + } + + fragInfos = getWeightedFragInfoList(fragInfos); + int limitFragments = maxNumFragments < fragInfos.size() ? maxNumFragments : fragInfos.size(); + List fragments = new ArrayList( limitFragments ); + + StringBuilder buffer = new StringBuilder(); + int[] nextValueIndex = { 0 }; + for( int n = 0; n < limitFragments; n++ ){ + WeightedFragInfo fragInfo = fragInfos.get( n ); + fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder ) ); + } + return fragments.toArray( new String[fragments.size()] ); + } + + protected Field[] getFields( IndexReader reader, int docId, final String fieldName) throws IOException { + // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field??? 
+ final List fields = new ArrayList(); + reader.document(docId, new StoredFieldVisitor() { + + @Override + public void stringField(FieldInfo fieldInfo, String value) { + FieldType ft = new FieldType(TextField.TYPE_STORED); + ft.setStoreTermVectors(fieldInfo.hasVectors()); + fields.add(new Field(fieldInfo.name, value, ft)); + } + + @Override + public Status needsField(FieldInfo fieldInfo) { + return fieldInfo.name.equals(fieldName) ? Status.YES : Status.NO; + } + }); + return fields.toArray(new Field[fields.size()]); + } + + protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo, + String[] preTags, String[] postTags, Encoder encoder ){ + StringBuilder fragment = new StringBuilder(); + final int s = fragInfo.getStartOffset(); + int[] modifiedStartOffset = { s }; + String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset ); + int srcIndex = 0; + for( SubInfo subInfo : fragInfo.getSubInfos() ){ + for( Toffs to : subInfo.getTermsOffsets() ){ + fragment + .append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) ) + .append( getPreTag( preTags, subInfo.getSeqnum() ) ) + .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) ) + .append( getPostTag( postTags, subInfo.getSeqnum() ) ); + srcIndex = to.getEndOffset() - modifiedStartOffset[0]; + } + } + fragment.append( encoder.encodeText( src.substring( srcIndex ) ) ); + return fragment.toString(); + } + + protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values, + int startOffset, int endOffset, int[] modifiedStartOffset ){ + while( buffer.length() < endOffset && index[0] < values.length ){ + buffer.append( values[index[0]++].stringValue() ); + buffer.append( getMultiValuedSeparator() ); + } + int bufferLength = buffer.length(); + // we added the multi value char to 
the last buffer, ignore it + if (values[index[0] - 1].fieldType().tokenized()) { + bufferLength--; + } + int eo = bufferLength < endOffset ? bufferLength : boundaryScanner.findEndOffset( buffer, endOffset ); + modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset ); + return buffer.substring( modifiedStartOffset[0], eo ); + } + + protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values, + int startOffset, int endOffset ){ + while( buffer.length() < endOffset && index[0] < values.length ){ + buffer.append( values[index[0]].stringValue() ); + buffer.append( multiValuedSeparator ); + index[0]++; + } + int eo = buffer.length() < endOffset ? buffer.length() : endOffset; + return buffer.substring( startOffset, eo ); + } + + protected List discreteMultiValueHighlighting(List fragInfos, Field[] fields) { + Map> fieldNameToFragInfos = new HashMap>(); + for (Field field : fields) { + fieldNameToFragInfos.put(field.name(), new ArrayList()); + } + + fragInfos: for (WeightedFragInfo fragInfo : fragInfos) { + int fieldStart; + int fieldEnd = 0; + for (Field field : fields) { + if (field.stringValue().isEmpty()) { + fieldEnd++; + continue; + } + fieldStart = fieldEnd; + fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name. 
+ + if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart && + fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) { + fieldNameToFragInfos.get(field.name()).add(fragInfo); + continue fragInfos; + } + + if (fragInfo.getSubInfos().isEmpty()) { + continue fragInfos; + } + + Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0); + if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) { + continue; + } + + int fragStart = fieldStart; + if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) { + fragStart = fragInfo.getStartOffset(); + } + + int fragEnd = fieldEnd; + if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) { + fragEnd = fragInfo.getEndOffset(); + } + + + List subInfos = new ArrayList(); + WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, fragInfo.getTotalBoost()); + + Iterator subInfoIterator = fragInfo.getSubInfos().iterator(); + while (subInfoIterator.hasNext()) { + SubInfo subInfo = subInfoIterator.next(); + List toffsList = new ArrayList(); + Iterator toffsIterator = subInfo.getTermsOffsets().iterator(); + while (toffsIterator.hasNext()) { + Toffs toffs = toffsIterator.next(); + if (toffs.getStartOffset() >= fieldStart && toffs.getEndOffset() <= fieldEnd) { + toffsList.add(toffs); + toffsIterator.remove(); + } + } + if (!toffsList.isEmpty()) { + subInfos.add(new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum())); + } + + if (subInfo.getTermsOffsets().isEmpty()) { + subInfoIterator.remove(); + } + } + fieldNameToFragInfos.get(field.name()).add(weightedFragInfo); + } + } + + List result = new ArrayList(); + for (List weightedFragInfos : fieldNameToFragInfos.values()) { + result.addAll(weightedFragInfos); + } + CollectionUtil.timSort(result, new Comparator() { + + @Override + public int compare(XFieldFragList.WeightedFragInfo info1, 
XFieldFragList.WeightedFragInfo info2) { + return info1.getStartOffset() - info2.getStartOffset(); + } + + }); + + return result; + } + + public void setMultiValuedSeparator( char separator ){ + multiValuedSeparator = separator; + } + + public char getMultiValuedSeparator(){ + return multiValuedSeparator; + } + + public boolean isDiscreteMultiValueHighlighting() { + return discreteMultiValueHighlighting; + } + + public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) { + this.discreteMultiValueHighlighting = discreteMultiValueHighlighting; + } + + protected String getPreTag( int num ){ + return getPreTag( preTags, num ); + } + + protected String getPostTag( int num ){ + return getPostTag( postTags, num ); + } + + protected String getPreTag( String[] preTags, int num ){ + int n = num % preTags.length; + return preTags[n]; + } + + protected String getPostTag( String[] postTags, int num ){ + int n = num % postTags.length; + return postTags[n]; + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XFastVectorHighlighter.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XFastVectorHighlighter.java new file mode 100644 index 00000000000..042cb8774f6 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XFastVectorHighlighter.java @@ -0,0 +1,223 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.highlight.Encoder; + +import java.io.IOException; + +/** + * Another highlighter implementation. + * + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XFastVectorHighlighter { + + public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true; + public static final boolean DEFAULT_FIELD_MATCH = true; + private final boolean phraseHighlight; + private final boolean fieldMatch; + private final XFragListBuilder fragListBuilder; + private final XFragmentsBuilder fragmentsBuilder; + private int phraseLimit = Integer.MAX_VALUE; + + /** + * the default constructor. + */ + public XFastVectorHighlighter(){ + this( DEFAULT_PHRASE_HIGHLIGHT, DEFAULT_FIELD_MATCH ); + } + + /** + * a constructor. Using {@link XSimpleFragListBuilder} and {@link XScoreOrderFragmentsBuilder}. + * + * @param phraseHighlight true or false for phrase highlighting + * @param fieldMatch true or false for field matching + */ + public XFastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch ){ + this( phraseHighlight, fieldMatch, new XSimpleFragListBuilder(), new XScoreOrderFragmentsBuilder() ); + } + + /** + * a constructor. A {@link XFragListBuilder} and a {@link XFragmentsBuilder} can be specified (plugins). 
+ * + * @param phraseHighlight true or false for phrase highlighting + * @param fieldMatch true or false for field matching + * @param fragListBuilder an instance of {@link XFragListBuilder} + * @param fragmentsBuilder an instance of {@link XFragmentsBuilder} + */ + public XFastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch, + XFragListBuilder fragListBuilder, XFragmentsBuilder fragmentsBuilder ){ + this.phraseHighlight = phraseHighlight; + this.fieldMatch = fieldMatch; + this.fragListBuilder = fragListBuilder; + this.fragmentsBuilder = fragmentsBuilder; + } + + /** + * create a {@link XFieldQuery} object. + * + * @param query a query + * @return the created {@link XFieldQuery} object + */ + public XFieldQuery getFieldQuery( Query query ) { + // TODO: should we deprecate this? + // because if there is no reader, then we cannot rewrite MTQ. + try { + return new XFieldQuery( query, null, phraseHighlight, fieldMatch ); + } catch (IOException e) { + // should never be thrown when reader is null + throw new RuntimeException (e); + } + } + + /** + * create a {@link XFieldQuery} object. + * + * @param query a query + * @return the created {@link XFieldQuery} object + */ + public XFieldQuery getFieldQuery( Query query, IndexReader reader ) throws IOException { + return new XFieldQuery( query, reader, phraseHighlight, fieldMatch ); + } + + /** + * return the best fragment. 
+ * + * @param fieldQuery {@link XFieldQuery} object + * @param reader {@link IndexReader} of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fragCharSize the length (number of chars) of a fragment + * @return the best fragment (snippet) string + * @throws IOException If there is a low-level I/O error + */ + public final String getBestFragment( final XFieldQuery fieldQuery, IndexReader reader, int docId, + String fieldName, int fragCharSize ) throws IOException { + XFieldFragList fieldFragList = + getFieldFragList( fragListBuilder, fieldQuery, reader, docId, fieldName, fragCharSize ); + return fragmentsBuilder.createFragment( reader, docId, fieldName, fieldFragList ); + } + + /** + * return the best fragments. + * + * @param fieldQuery {@link XFieldQuery} object + * @param reader {@link IndexReader} of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fragCharSize the length (number of chars) of a fragment + * @param maxNumFragments maximum number of fragments + * @return created fragments or null when no fragments created. + * size of the array can be less than maxNumFragments + * @throws IOException If there is a low-level I/O error + */ + public final String[] getBestFragments( final XFieldQuery fieldQuery, IndexReader reader, int docId, + String fieldName, int fragCharSize, int maxNumFragments ) throws IOException { + XFieldFragList fieldFragList = + getFieldFragList( fragListBuilder, fieldQuery, reader, docId, fieldName, fragCharSize ); + return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments ); + } + + /** + * return the best fragment. 
+ * + * @param fieldQuery {@link XFieldQuery} object + * @param reader {@link IndexReader} of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fragCharSize the length (number of chars) of a fragment + * @param fragListBuilder {@link XFragListBuilder} object + * @param fragmentsBuilder {@link XFragmentsBuilder} object + * @param preTags pre-tags to be used to highlight terms + * @param postTags post-tags to be used to highlight terms + * @param encoder an encoder that generates encoded text + * @return the best fragment (snippet) string + * @throws IOException If there is a low-level I/O error + */ + public final String getBestFragment( final XFieldQuery fieldQuery, IndexReader reader, int docId, + String fieldName, int fragCharSize, + XFragListBuilder fragListBuilder, XFragmentsBuilder fragmentsBuilder, + String[] preTags, String[] postTags, Encoder encoder ) throws IOException { + XFieldFragList fieldFragList = getFieldFragList( fragListBuilder, fieldQuery, reader, docId, fieldName, fragCharSize ); + return fragmentsBuilder.createFragment( reader, docId, fieldName, fieldFragList, preTags, postTags, encoder ); + } + + /** + * return the best fragments. + * + * @param fieldQuery {@link XFieldQuery} object + * @param reader {@link IndexReader} of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fragCharSize the length (number of chars) of a fragment + * @param maxNumFragments maximum number of fragments + * @param fragListBuilder {@link XFragListBuilder} object + * @param fragmentsBuilder {@link XFragmentsBuilder} object + * @param preTags pre-tags to be used to highlight terms + * @param postTags post-tags to be used to highlight terms + * @param encoder an encoder that generates encoded text + * @return created fragments or null when no fragments created. 
+ * size of the array can be less than maxNumFragments + * @throws IOException If there is a low-level I/O error + */ + public final String[] getBestFragments( final XFieldQuery fieldQuery, IndexReader reader, int docId, + String fieldName, int fragCharSize, int maxNumFragments, + XFragListBuilder fragListBuilder, XFragmentsBuilder fragmentsBuilder, + String[] preTags, String[] postTags, Encoder encoder ) throws IOException { + XFieldFragList fieldFragList = + getFieldFragList( fragListBuilder, fieldQuery, reader, docId, fieldName, fragCharSize ); + return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments, + preTags, postTags, encoder ); + } + + private XFieldFragList getFieldFragList( XFragListBuilder fragListBuilder, + final XFieldQuery fieldQuery, IndexReader reader, int docId, + String fieldName, int fragCharSize ) throws IOException { + XFieldTermStack fieldTermStack = new XFieldTermStack( reader, docId, fieldName, fieldQuery ); + XFieldPhraseList fieldPhraseList = new XFieldPhraseList( fieldTermStack, fieldQuery, phraseLimit ); + return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize ); + } + + /** + * return whether phraseHighlight or not. + * + * @return whether phraseHighlight or not + */ + public boolean isPhraseHighlight(){ return phraseHighlight; } + + /** + * return whether fieldMatch or not. + * + * @return whether fieldMatch or not + */ + public boolean isFieldMatch(){ return fieldMatch; } + + /** + * @return the maximum number of phrases to analyze when searching for the highest-scoring phrase. + */ + public int getPhraseLimit () { return phraseLimit; } + + /** + * set the maximum number of phrases to analyze when searching for the highest-scoring phrase. + * The default is unlimited (Integer.MAX_VALUE). 
+ */ + public void setPhraseLimit (int phraseLimit) { this.phraseLimit = phraseLimit; } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldFragList.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldFragList.java new file mode 100644 index 00000000000..cd4e26ebd56 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldFragList.java @@ -0,0 +1,142 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo; +import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo.Toffs; + +import java.util.ArrayList; +import java.util.List; + +/** + * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class + * to create fragments (snippets). + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public abstract class XFieldFragList { + + private List fragInfos = new ArrayList(); + + /** + * a constructor. 
+ * + * @param fragCharSize the length (number of chars) of a fragment + */ + public XFieldFragList( int fragCharSize ){ + } + + /** + * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos + * + * @param startOffset start offset of the fragment + * @param endOffset end offset of the fragment + * @param phraseInfoList list of WeightedPhraseInfo objects + */ + public abstract void add( int startOffset, int endOffset, List phraseInfoList ); + + /** + * return the list of WeightedFragInfos. + * + * @return fragInfos. + */ + public List getFragInfos() { + return fragInfos; + } + + /** + * List of term offsets + weight for a frag info + */ + public static class WeightedFragInfo { + + private List subInfos; + private float totalBoost; + private int startOffset; + private int endOffset; + + public WeightedFragInfo( int startOffset, int endOffset, List subInfos, float totalBoost ){ + this.startOffset = startOffset; + this.endOffset = endOffset; + this.totalBoost = totalBoost; + this.subInfos = subInfos; + } + + public List getSubInfos(){ + return subInfos; + } + + public float getTotalBoost(){ + return totalBoost; + } + + public int getStartOffset(){ + return startOffset; + } + + public int getEndOffset(){ + return endOffset; + } + + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( "subInfos=(" ); + for( SubInfo si : subInfos ) + sb.append( si.toString() ); + sb.append( ")/" ).append( totalBoost ).append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); + return sb.toString(); + } + + /** + * Represents the list of term offsets for some text + */ + public static class SubInfo { + private final String text; // unnecessary member, just exists for debugging purpose + private final List termsOffsets; // usually termsOffsets.size() == 1, + // but if position-gap > 1 and slop > 0 then size() could be greater than 1 + private int seqnum; + + public SubInfo( String text, 
List termsOffsets, int seqnum ){ + this.text = text; + this.termsOffsets = termsOffsets; + this.seqnum = seqnum; + } + + public List getTermsOffsets(){ + return termsOffsets; + } + + public int getSeqnum(){ + return seqnum; + } + + public String getText(){ + return text; + } + + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( text ).append( '(' ); + for( Toffs to : termsOffsets ) + sb.append( to.toString() ); + sb.append( ')' ); + return sb.toString(); + } + } + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldPhraseList.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldPhraseList.java new file mode 100644 index 00000000000..07e780119bd --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldPhraseList.java @@ -0,0 +1,261 @@ +package org.apache.lucene.search.vectorhighlight; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.search.vectorhighlight.XFieldQuery.QueryPhraseMap; +import org.apache.lucene.search.vectorhighlight.XFieldTermStack.TermInfo; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +/** + * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder + * to create a FieldFragList object. + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XFieldPhraseList { + + LinkedList phraseList = new LinkedList(); + + /** + * create a FieldPhraseList that has no limit on the number of phrases to analyze + * + * @param fieldTermStack FieldTermStack object + * @param fieldQuery FieldQuery object + */ + public XFieldPhraseList( XFieldTermStack fieldTermStack, XFieldQuery fieldQuery){ + this (fieldTermStack, fieldQuery, Integer.MAX_VALUE); + } + + /** + * return the list of WeightedPhraseInfo. + * + * @return phraseList. + */ + public List getPhraseList() { + return phraseList; + } + + /** + * a constructor. 
+ * + * @param fieldTermStack FieldTermStack object + * @param fieldQuery FieldQuery object + * @param phraseLimit maximum size of phraseList + */ + public XFieldPhraseList( XFieldTermStack fieldTermStack, XFieldQuery fieldQuery, int phraseLimit ){ + final String field = fieldTermStack.getFieldName(); + + QueryPhraseMap qpm = fieldQuery.getRootMap(field); + if (qpm != null) { + LinkedList phraseCandidate = new LinkedList(); + extractPhrases(fieldTermStack.termList, qpm, phraseCandidate, 0); + assert phraseCandidate.size() == 0; + } + } + + void extractPhrases(LinkedList terms, QueryPhraseMap currMap, LinkedList phraseCandidate, int longest) { + if (terms.isEmpty()) { + if (longest > 0) { + addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); + } + return; + } + ArrayList samePositionTerms = new ArrayList(); + do { + samePositionTerms.add(terms.pop()); + } while (!terms.isEmpty() && terms.get(0).getPosition() == samePositionTerms.get(0).getPosition()); + + // try all next terms at the same position + for (TermInfo nextTerm : samePositionTerms) { + QueryPhraseMap nextMap = currMap.getTermMap(nextTerm.getText()); + if (nextMap != null) { + phraseCandidate.add(nextTerm); + int l = longest; + if(nextMap.isValidTermOrPhrase( phraseCandidate ) ){ + l = phraseCandidate.size(); + } + extractPhrases(terms, nextMap, phraseCandidate, l); + phraseCandidate.removeLast(); + } + } + + // ignore the next term + extractPhrases(terms, currMap, phraseCandidate, longest); + + // add terms back + for (TermInfo nextTerm : samePositionTerms) { + terms.push(nextTerm); + } + } + + public void addIfNoOverlap( WeightedPhraseInfo wpi ){ + for( WeightedPhraseInfo existWpi : getPhraseList() ){ + if( existWpi.isOffsetOverlap( wpi ) ) { + // WeightedPhraseInfo.addIfNoOverlap() dumps the second part of, for example, hyphenated words (social-economics). 
+ // The result is that all informations in TermInfo are lost and not available for further operations. + existWpi.getTermsInfos().addAll( wpi.getTermsInfos() ); + return; + } + } + getPhraseList().add( wpi ); + } + + /** + * Represents the list of term offsets and boost for some text + */ + public static class WeightedPhraseInfo { + + private String text; // unnecessary member, just exists for debugging purpose + private List termsOffsets; // usually termsOffsets.size() == 1, + // but if position-gap > 1 and slop > 0 then size() could be greater than 1 + private float boost; // query boost + private int seqnum; + + private ArrayList termsInfos; + + /** + * @return the text + */ + public String getText() { + return text; + } + + /** + * @return the termsOffsets + */ + public List getTermsOffsets() { + return termsOffsets; + } + + /** + * @return the boost + */ + public float getBoost() { + return boost; + } + + /** + * @return the termInfos + */ + public List getTermsInfos() { + return termsInfos; + } + + public WeightedPhraseInfo( List terms, float boost ){ + this( terms, boost, 0 ); + } + + public WeightedPhraseInfo( List terms, float boost, int seqnum ){ + this.boost = boost; + this.seqnum = seqnum; + + // We keep TermInfos for further operations + termsInfos = new ArrayList( terms ); + + termsOffsets = new ArrayList( terms.size() ); + TermInfo ti = terms.get( 0 ); + termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); + if( terms.size() == 1 ){ + text = ti.getText(); + return; + } + StringBuilder sb = new StringBuilder(); + sb.append( ti.getText() ); + int pos = ti.getPosition(); + for( int i = 1; i < terms.size(); i++ ){ + ti = terms.get( i ); + sb.append( ti.getText() ); + if( ti.getPosition() - pos == 1 ){ + Toffs to = termsOffsets.get( termsOffsets.size() - 1 ); + to.setEndOffset( ti.getEndOffset() ); + } + else{ + termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); + } + pos = ti.getPosition(); + } + text = 
sb.toString(); + } + + public int getStartOffset(){ + return termsOffsets.get( 0 ).startOffset; + } + + public int getEndOffset(){ + return termsOffsets.get( termsOffsets.size() - 1 ).endOffset; + } + + public boolean isOffsetOverlap( WeightedPhraseInfo other ){ + int so = getStartOffset(); + int eo = getEndOffset(); + int oso = other.getStartOffset(); + int oeo = other.getEndOffset(); + if( so <= oso && oso < eo ) return true; + if( so < oeo && oeo <= eo ) return true; + if( oso <= so && so < oeo ) return true; + if( oso < eo && eo <= oeo ) return true; + return false; + } + + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( text ).append( '(' ).append( boost ).append( ")(" ); + for( Toffs to : termsOffsets ){ + sb.append( to ); + } + sb.append( ')' ); + return sb.toString(); + } + + /** + * @return the seqnum + */ + public int getSeqnum() { + return seqnum; + } + + /** + * Term offsets (start + end) + */ + public static class Toffs { + private int startOffset; + private int endOffset; + public Toffs( int startOffset, int endOffset ){ + this.startOffset = startOffset; + this.endOffset = endOffset; + } + public void setEndOffset( int endOffset ){ + this.endOffset = endOffset; + } + public int getStartOffset(){ + return startOffset; + } + public int getEndOffset(){ + return endOffset; + } + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); + return sb.toString(); + } + } + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldQuery.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldQuery.java new file mode 100644 index 00000000000..6545f97206c --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldQuery.java @@ -0,0 +1,520 @@ +package org.apache.lucene.search.vectorhighlight; +/* + * Licensed to the Apache Software Foundation (ASF) under 
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import org.apache.lucene.search.vectorhighlight.XFieldTermStack.TermInfo; +import org.apache.lucene.util.SorterTemplate; + +import java.io.IOException; +import java.util.*; + +/** + * FieldQuery breaks down query object into terms/phrases and keeps + * them in a QueryPhraseMap structure. 
+ */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XFieldQuery { + + final boolean fieldMatch; + + // fieldMatch==true, Map + // fieldMatch==false, Map + Map rootMaps = new HashMap(); + + // fieldMatch==true, Map + // fieldMatch==false, Map + Map> termSetMap = new HashMap>(); + + int termOrPhraseNumber; // used for colored tag support + + // The maximum number of different matching terms accumulated from any one MultiTermQuery + private static final int MAX_MTQ_TERMS = 1024; + + XFieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException { + this.fieldMatch = fieldMatch; + Set flatQueries = new LinkedHashSet(); + flatten( query, reader, flatQueries ); + saveTerms( flatQueries, reader ); + Collection expandQueries = expand( flatQueries ); + + for( Query flatQuery : expandQueries ){ + QueryPhraseMap rootMap = getRootMap( flatQuery ); + rootMap.add( flatQuery, reader ); + if( !phraseHighlight && flatQuery instanceof PhraseQuery ){ + PhraseQuery pq = (PhraseQuery)flatQuery; + if( pq.getTerms().length > 1 ){ + for( Term term : pq.getTerms() ) + rootMap.addTerm( term, flatQuery.getBoost() ); + } + } + } + } + + /** For backwards compatibility you can initialize FieldQuery without + * an IndexReader, which is only required to support MultiTermQuery + */ + XFieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException { + this (query, null, phraseHighlight, fieldMatch); + } + + void flatten( Query sourceQuery, IndexReader reader, Collection flatQueries ) throws IOException{ + if( sourceQuery instanceof BooleanQuery ){ + BooleanQuery bq = (BooleanQuery)sourceQuery; + for( BooleanClause clause : bq.getClauses() ){ + if( !clause.isProhibited() ) + flatten( clause.getQuery(), reader, flatQueries ); + } + } else if( sourceQuery instanceof DisjunctionMaxQuery ){ + DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery; + for( Query query : dmq ){ + flatten( query, reader, 
flatQueries ); + } + } + else if( sourceQuery instanceof TermQuery ){ + if( !flatQueries.contains( sourceQuery ) ) + flatQueries.add( sourceQuery ); + } + else if( sourceQuery instanceof PhraseQuery ){ + if( !flatQueries.contains( sourceQuery ) ){ + PhraseQuery pq = (PhraseQuery)sourceQuery; + if( pq.getTerms().length > 1 ) + flatQueries.add( pq ); + else if( pq.getTerms().length == 1 ){ + flatQueries.add( new TermQuery( pq.getTerms()[0] ) ); + } + } + } else if (sourceQuery instanceof ConstantScoreQuery) { + final Query q = ((ConstantScoreQuery) sourceQuery).getQuery(); + if (q != null) { + flatten(q, reader, flatQueries); + } + } else if (sourceQuery instanceof FilteredQuery) { + final Query q = ((FilteredQuery) sourceQuery).getQuery(); + if (q != null) { + flatten(q, reader, flatQueries); + } + } else if (reader != null){ + Query query = sourceQuery; + if (sourceQuery instanceof MultiTermQuery) { + MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone(); + copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS)); + query = copy; + } + Query rewritten = query.rewrite(reader); + if (rewritten != query) { + // only rewrite once and then flatten again - the rewritten query could have a speacial treatment + // if this method is overwritten in a subclass. + flatten(rewritten, reader, flatQueries); + + } + // if the query is already rewritten we discard it + } + // else discard queries + } + + /* + * Create expandQueries from flatQueries. 
+ * + * expandQueries := flatQueries + overlapped phrase queries + * + * ex1) flatQueries={a,b,c} + * => expandQueries={a,b,c} + * ex2) flatQueries={a,"b c","c d"} + * => expandQueries={a,"b c","c d","b c d"} + */ + Collection expand( Collection flatQueries ){ + Set expandQueries = new LinkedHashSet(); + for( Iterator i = flatQueries.iterator(); i.hasNext(); ){ + Query query = i.next(); + i.remove(); + expandQueries.add( query ); + if( !( query instanceof PhraseQuery ) ) continue; + for( Iterator j = flatQueries.iterator(); j.hasNext(); ){ + Query qj = j.next(); + if( !( qj instanceof PhraseQuery ) ) continue; + checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj ); + } + } + return expandQueries; + } + + /* + * Check if PhraseQuery A and B have overlapped part. + * + * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"} + * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"} + * ex3) A="a b", B="c d" => no overlap; expandQueries={} + */ + private void checkOverlap( Collection expandQueries, PhraseQuery a, PhraseQuery b ){ + if( a.getSlop() != b.getSlop() ) return; + Term[] ats = a.getTerms(); + Term[] bts = b.getTerms(); + if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return; + checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() ); + checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() ); + } + + /* + * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries. 
+ * + * ex1) src="a b", dest="c d" => no overlap + * ex2) src="a b", dest="a b c" => no overlap + * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"} + * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"} + * ex5) src="a b c", dest="b c" => no overlap + * ex6) src="a b c", dest="b" => no overlap + * ex7) src="a a a a", dest="a a a" => overlap; + * expandQueries={"a a a a a","a a a a a a"} + * ex8) src="a b c d", dest="b c" => no overlap + */ + private void checkOverlap( Collection expandQueries, Term[] src, Term[] dest, int slop, float boost ){ + // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms + // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten() + // converts PhraseQuery to TermQuery) + for( int i = 1; i < src.length; i++ ){ + boolean overlap = true; + for( int j = i; j < src.length; j++ ){ + if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){ + overlap = false; + break; + } + } + if( overlap && src.length - i < dest.length ){ + PhraseQuery pq = new PhraseQuery(); + for( Term srcTerm : src ) + pq.add( srcTerm ); + for( int k = src.length - i; k < dest.length; k++ ){ + pq.add( new Term( src[0].field(), dest[k].text() ) ); + } + pq.setSlop( slop ); + pq.setBoost( boost ); + if(!expandQueries.contains( pq ) ) + expandQueries.add( pq ); + } + } + } + + QueryPhraseMap getRootMap( Query query ){ + String key = getKey( query ); + QueryPhraseMap map = rootMaps.get( key ); + if( map == null ){ + map = new QueryPhraseMap( this ); + rootMaps.put( key, map ); + } + return map; + } + + /* + * Return 'key' string. 'key' is the field name of the Query. + * If not fieldMatch, 'key' will be null. 
+ */ + private String getKey( Query query ){ + if( !fieldMatch ) return null; + if( query instanceof TermQuery ) + return ((TermQuery)query).getTerm().field(); + else if ( query instanceof PhraseQuery ){ + PhraseQuery pq = (PhraseQuery)query; + Term[] terms = pq.getTerms(); + return terms[0].field(); + } + else if (query instanceof MultiTermQuery) { + return ((MultiTermQuery)query).getField(); + } + else + throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); + } + + /* + * Save the set of terms in the queries to termSetMap. + * + * ex1) q=name:john + * - fieldMatch==true + * termSetMap=Map<"name",Set<"john">> + * - fieldMatch==false + * termSetMap=Map> + * + * ex2) q=name:john title:manager + * - fieldMatch==true + * termSetMap=Map<"name",Set<"john">, + * "title",Set<"manager">> + * - fieldMatch==false + * termSetMap=Map> + * + * ex3) q=name:"john lennon" + * - fieldMatch==true + * termSetMap=Map<"name",Set<"john","lennon">> + * - fieldMatch==false + * termSetMap=Map> + */ + void saveTerms( Collection flatQueries, IndexReader reader ) throws IOException{ + for( Query query : flatQueries ){ + Set termSet = getTermSet( query ); + if( query instanceof TermQuery ) + termSet.add( ((TermQuery)query).getTerm().text() ); + else if( query instanceof PhraseQuery ){ + for( Term term : ((PhraseQuery)query).getTerms() ) + termSet.add( term.text() ); + } + else if (query instanceof MultiTermQuery && reader != null) { + BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader); + for (BooleanClause clause : mtqTerms.getClauses()) { + termSet.add (((TermQuery) clause.getQuery()).getTerm().text()); + } + } + else + throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." 
); + } + } + + private Set getTermSet( Query query ){ + String key = getKey( query ); + Set set = termSetMap.get( key ); + if( set == null ){ + set = new HashSet(); + termSetMap.put( key, set ); + } + return set; + } + + Set getTermSet( String field ){ + return termSetMap.get( fieldMatch ? field : null ); + } + + /** + * + * @return QueryPhraseMap + */ + public QueryPhraseMap getFieldTermMap( String fieldName, String term ){ + QueryPhraseMap rootMap = getRootMap( fieldName ); + return rootMap == null ? null : rootMap.subMap.get( term ); + } + + /** + * + * @return QueryPhraseMap + */ + public QueryPhraseMap searchPhrase( String fieldName, final List phraseCandidate ){ + QueryPhraseMap root = getRootMap( fieldName ); + if( root == null ) return null; + return root.searchPhrase( phraseCandidate ); + } + + public QueryPhraseMap getRootMap( String fieldName ){ + return rootMaps.get( fieldMatch ? fieldName : null ); + } + + int nextTermOrPhraseNumber(){ + return termOrPhraseNumber++; + } + + /** + * Internal structure of a query for highlighting: represents + * a nested query structure + */ + public static class QueryPhraseMap { + + boolean terminal; + int slop; // valid if terminal == true and phraseHighlight == true + float boost; // valid if terminal == true + int[] positions; // valid if terminal == true + int termOrPhraseNumber; // valid if terminal == true + XFieldQuery fieldQuery; + Map subMap = new HashMap(); + + public QueryPhraseMap( XFieldQuery fieldQuery ){ + this.fieldQuery = fieldQuery; + } + + void addTerm( Term term, float boost ){ + QueryPhraseMap map = getOrNewMap( subMap, term.text() ); + map.markTerminal( boost ); + } + + private QueryPhraseMap getOrNewMap( Map subMap, String term ){ + QueryPhraseMap map = subMap.get( term ); + if( map == null ){ + map = new QueryPhraseMap( fieldQuery ); + subMap.put( term, map ); + } + return map; + } + + void add( Query query, IndexReader reader ) { + if( query instanceof TermQuery ){ + addTerm( 
((TermQuery)query).getTerm(), query.getBoost() ); + } + else if( query instanceof PhraseQuery ){ + PhraseQuery pq = (PhraseQuery)query; + final Term[] terms = pq.getTerms(); + final int[] positions = pq.getPositions(); + new SorterTemplate() { + + @Override + protected void swap(int i, int j) { + Term tmpTerm = terms[i]; + terms[i] = terms[j]; + terms[j] = tmpTerm; + + int tmpPos = positions[i]; + positions[i] = positions[j]; + positions[j] = tmpPos; + } + + @Override + protected int compare(int i, int j) { + return positions[i] - positions[j]; + } + + @Override + protected void setPivot(int i) { + throw new UnsupportedOperationException(); + } + + @Override + protected int comparePivot(int j) { + throw new UnsupportedOperationException(); + } + }.mergeSort(0, terms.length - 1); + + addToMap(pq, terms, positions, 0, subMap, pq.getSlop()); + } + else + throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); + } + + private int numTermsAtSamePosition(int[] positions, int i) { + int numTermsAtSamePosition = 1; + for (int j = i + 1; j < positions.length; ++j) { + if (positions[j] == positions[i]) { + ++numTermsAtSamePosition; + } + } + return numTermsAtSamePosition; + } + + private void addToMap(PhraseQuery pq, Term[] terms, int[] positions, int i, Map map, int slop) { + int numTermsAtSamePosition = numTermsAtSamePosition(positions, i); + for (int j = 0; j < numTermsAtSamePosition; ++j) { + QueryPhraseMap qpm = getOrNewMap(map, terms[i + j].text()); + if (i + numTermsAtSamePosition == terms.length) { + qpm.markTerminal(pq.getSlop(), pq.getBoost(), uniquePositions(positions)); + } else { + addToMap(pq, terms, positions, i + numTermsAtSamePosition, qpm.subMap, slop); + } + } + if (slop > 2 && i + numTermsAtSamePosition < terms.length) { + Term[] otherTerms = Arrays.copyOf(terms, terms.length); + int[] otherPositions = Arrays.copyOf(positions, positions.length); + final int nextTermAtSamePosition = numTermsAtSamePosition(positions, i + 
numTermsAtSamePosition); + System.arraycopy(terms, i + numTermsAtSamePosition, otherTerms, i, nextTermAtSamePosition); + System.arraycopy(positions, i + numTermsAtSamePosition, otherPositions, i, nextTermAtSamePosition); + System.arraycopy(terms, i, otherTerms, i + nextTermAtSamePosition, numTermsAtSamePosition); + System.arraycopy(positions, i, otherPositions, i + nextTermAtSamePosition, numTermsAtSamePosition); + addToMap(pq, otherTerms, otherPositions, i, map, slop - 2); + } + } + + private int[] uniquePositions(int[] positions) { + int uniqueCount = 1; + for (int i = 1; i < positions.length; ++i) { + if (positions[i] != positions[i - 1]) { + ++uniqueCount; + } + } + if (uniqueCount == positions.length) { + return positions; + } + int[] result = new int[uniqueCount]; + result[0] = positions[0]; + for (int i = 1, j = 1; i < positions.length; ++i) { + if (positions[i] != positions[i - 1]) { + result[j++] = positions[i]; + } + } + return result; + } + + public QueryPhraseMap getTermMap( String term ){ + return subMap.get( term ); + } + + private void markTerminal( float boost ){ + markTerminal( 0, boost, null ); + } + + private void markTerminal( int slop, float boost, int[] positions ){ + if (slop > this.slop || (slop == this.slop && boost > this.boost)) { + this.terminal = true; + this.slop = slop; + this.boost = boost; + this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); + this.positions = positions; + } + } + + public boolean isTerminal(){ + return terminal; + } + + public int getSlop(){ + return slop; + } + + public float getBoost(){ + return boost; + } + + public int getTermOrPhraseNumber(){ + return termOrPhraseNumber; + } + + public QueryPhraseMap searchPhrase( final List phraseCandidate ){ + QueryPhraseMap currMap = this; + for( TermInfo ti : phraseCandidate ){ + currMap = currMap.subMap.get( ti.getText() ); + if( currMap == null ) return null; + } + return currMap.isValidTermOrPhrase( phraseCandidate ) ? 
currMap : null; + } + + public boolean isValidTermOrPhrase( final List phraseCandidate ){ + // check terminal + if( !terminal ) return false; + + // if the candidate is a term, it is valid + if( phraseCandidate.size() == 1 ) return true; + + + assert phraseCandidate.size() == positions.length; + // else check whether the candidate is valid phrase + // compare position-gaps between terms to slop + int pos = phraseCandidate.get( 0 ).getPosition(); + int totalDistance = 0; + for( int i = 1; i < phraseCandidate.size(); i++ ){ + int nextPos = phraseCandidate.get( i ).getPosition(); + final int expectedDelta = positions[i] - positions[i - 1]; + final int actualDelta = nextPos - pos; + totalDistance += Math.abs(expectedDelta - actualDelta); + pos = nextPos; + } + return totalDistance <= slop; + } + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldTermStack.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldTermStack.java new file mode 100644 index 00000000000..0b7df36f882 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XFieldTermStack.java @@ -0,0 +1,209 @@ +package org.apache.lucene.search.vectorhighlight; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.*; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CollectionUtil; +import org.apache.lucene.util.UnicodeUtil; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +/** + * FieldTermStack is a stack that keeps query terms in the specified field + * of the document to be highlighted. + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XFieldTermStack { + + private final String fieldName; + LinkedList termList = new LinkedList(); + + //public static void main( String[] args ) throws Exception { + // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT); + // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer ); + // Query query = parser.parse( "a x:b" ); + // FieldQuery fieldQuery = new FieldQuery( query, true, false ); + + // Directory dir = new RAMDirectory(); + // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)); + // Document doc = new Document(); + // FieldType ft = new FieldType(TextField.TYPE_STORED); + // ft.setStoreTermVectors(true); + // ft.setStoreTermVectorOffsets(true); + // ft.setStoreTermVectorPositions(true); + // doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) ); + // doc.add( new Field( "f", ft, "b a b a f" ) ); + // writer.addDocument( doc ); + // writer.close(); + + // IndexReader reader = IndexReader.open(dir1); + // new FieldTermStack( reader, 0, "f", fieldQuery ); + // reader.close(); + //} + + /** + * a constructor. 
+ * + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fieldQuery FieldQuery object + * @throws IOException If there is a low-level I/O error + */ + public XFieldTermStack( IndexReader reader, int docId, String fieldName, final XFieldQuery fieldQuery ) throws IOException { + this.fieldName = fieldName; + + Set termSet = fieldQuery.getTermSet( fieldName ); + // just return to make null snippet if un-matched fieldName specified when fieldMatch == true + if( termSet == null ) return; + + final Fields vectors = reader.getTermVectors(docId); + if (vectors == null) { + // null snippet + return; + } + + final Terms vector = vectors.terms(fieldName); + if (vector == null) { + // null snippet + return; + } + + final CharsRef spare = new CharsRef(); + final TermsEnum termsEnum = vector.iterator(null); + DocsAndPositionsEnum dpEnum = null; + BytesRef text; + + int numDocs = reader.maxDoc(); + + final List termList = new ArrayList(); + while ((text = termsEnum.next()) != null) { + UnicodeUtil.UTF8toUTF16(text, spare); + final String term = spare.toString(); + if (!termSet.contains(term)) { + continue; + } + dpEnum = termsEnum.docsAndPositions(null, dpEnum); + if (dpEnum == null) { + // null snippet + return; + } + + dpEnum.nextDoc(); + + // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html + final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 ); + + final int freq = dpEnum.freq(); + + for(int i = 0;i < freq;i++) { + int pos = dpEnum.nextPosition(); + if (dpEnum.startOffset() < 0) { + return; // no offsets, null snippet + } + termList.add( new TermInfo( term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight ) ); + } + } + + // sort by position + CollectionUtil.timSort(termList); + this.termList.addAll(termList); + } + + /** + * 
@return field name + */ + public String getFieldName(){ + return fieldName; + } + + /** + * @return the top TermInfo object of the stack + */ + public TermInfo pop(){ + return termList.poll(); + } + + /** + * Return the top TermInfo object of the stack without removing it. + */ + public TermInfo peek() { + return termList.peek(); + } + + /** + * @param termInfo the TermInfo object to be put on the top of the stack + */ + public void push( TermInfo termInfo ){ + termList.push( termInfo ); + } + + /** + * to know whether the stack is empty + * + * @return true if the stack is empty, false if not + */ + public boolean isEmpty(){ + return termList == null || termList.size() == 0; + } + + /** + * Single term with its position/offsets in the document and IDF weight + */ + public static class TermInfo implements Comparable{ + + private final String text; + private final int startOffset; + private final int endOffset; + private final int position; + + // IDF-weight of this term + private final float weight; + + public TermInfo( String text, int startOffset, int endOffset, int position, float weight ){ + this.text = text; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.position = position; + this.weight = weight; + } + + public String getText(){ return text; } + public int getStartOffset(){ return startOffset; } + public int getEndOffset(){ return endOffset; } + public int getPosition(){ return position; } + public float getWeight(){ return weight; } + + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( text ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' ); + return sb.toString(); + } + + @Override + public int compareTo( TermInfo o ){ + return ( this.position - o.position ); + } + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XFragListBuilder.java 
b/src/main/java/org/apache/lucene/search/vectorhighlight/XFragListBuilder.java new file mode 100644 index 00000000000..2daee6f6fd4 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XFragListBuilder.java @@ -0,0 +1,35 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * FragListBuilder is an interface for FieldFragList builder classes. + * A FragListBuilder class can be plugged in to Highlighter. + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public interface XFragListBuilder { + + /** + * create a FieldFragList. 
+ * + * @param fieldPhraseList FieldPhraseList object + * @param fragCharSize the length (number of chars) of a fragment + * @return the created FieldFragList object + */ + public XFieldFragList createFieldFragList( XFieldPhraseList fieldPhraseList, int fragCharSize ); +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XFragmentsBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XFragmentsBuilder.java new file mode 100644 index 00000000000..306b83722d3 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XFragmentsBuilder.java @@ -0,0 +1,96 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.highlight.Encoder; + +import java.io.IOException; + +/** + * {@link org.apache.lucene.search.vectorhighlight.XFragmentsBuilder} is an interface for fragments (snippets) builder classes. + * A {@link org.apache.lucene.search.vectorhighlight.XFragmentsBuilder} class can be plugged in to + * {@link org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter}. 
+ */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public interface XFragmentsBuilder { + + /** + * create a fragment. + * + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fieldFragList FieldFragList object + * @return a created fragment or null when no fragment created + * @throws IOException If there is a low-level I/O error + */ + public String createFragment( IndexReader reader, int docId, String fieldName, + XFieldFragList fieldFragList ) throws IOException; + + /** + * create multiple fragments. + * + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fieldFragList FieldFragList object + * @param maxNumFragments maximum number of fragments + * @return created fragments or null when no fragments created. + * size of the array can be less than maxNumFragments + * @throws IOException If there is a low-level I/O error + */ + public String[] createFragments( IndexReader reader, int docId, String fieldName, + XFieldFragList fieldFragList, int maxNumFragments ) throws IOException; + + /** + * create a fragment. + * + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fieldFragList FieldFragList object + * @param preTags pre-tags to be used to highlight terms + * @param postTags post-tags to be used to highlight terms + * @param encoder an encoder that generates encoded text + * @return a created fragment or null when no fragment created + * @throws IOException If there is a low-level I/O error + */ + public String createFragment( IndexReader reader, int docId, String fieldName, + XFieldFragList fieldFragList, String[] preTags, String[] postTags, + Encoder encoder ) throws IOException; + + /** + * create multiple fragments. 
+ * + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fieldFragList FieldFragList object + * @param maxNumFragments maximum number of fragments + * @param preTags pre-tags to be used to highlight terms + * @param postTags post-tags to be used to highlight terms + * @param encoder an encoder that generates encoded text + * @return created fragments or null when no fragments created. + * size of the array can be less than maxNumFragments + * @throws IOException If there is a low-level I/O error + */ + public String[] createFragments( IndexReader reader, int docId, String fieldName, + XFieldFragList fieldFragList, int maxNumFragments, String[] preTags, String[] postTags, + Encoder encoder ) throws IOException; +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XScoreOrderFragmentsBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XScoreOrderFragmentsBuilder.java new file mode 100644 index 00000000000..9c2fb980298 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XScoreOrderFragmentsBuilder.java @@ -0,0 +1,84 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo; +import org.apache.lucene.util.CollectionUtil; + +import java.util.Comparator; +import java.util.List; + +/** + * An implementation of FragmentsBuilder that outputs score-order fragments. + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XScoreOrderFragmentsBuilder extends XBaseFragmentsBuilder { + + /** + * a constructor. + */ + public XScoreOrderFragmentsBuilder(){ + super(); + } + + /** + * a constructor. + * + * @param preTags array of pre-tags for markup terms. + * @param postTags array of post-tags for markup terms. + */ + public XScoreOrderFragmentsBuilder( String[] preTags, String[] postTags ){ + super( preTags, postTags ); + } + + public XScoreOrderFragmentsBuilder( BoundaryScanner bs ){ + super( bs ); + } + + public XScoreOrderFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner bs ){ + super( preTags, postTags, bs ); + } + + /** + * Sort by score the list of WeightedFragInfo + */ + @Override + public List getWeightedFragInfoList( List src ) { + CollectionUtil.timSort( src, new ScoreComparator() ); + return src; + } + + /** + * Comparator for {@link WeightedFragInfo} by boost, breaking ties + * by offset. 
+ */ + public static class ScoreComparator implements Comparator { + + @Override + public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) { + if( o1.getTotalBoost() > o2.getTotalBoost() ) return -1; + else if( o1.getTotalBoost() < o2.getTotalBoost() ) return 1; + // if same score then check startOffset + else{ + if( o1.getStartOffset() < o2.getStartOffset() ) return -1; + else if( o1.getStartOffset() > o2.getStartOffset() ) return 1; + } + return 0; + } + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFieldFragList.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFieldFragList.java new file mode 100644 index 00000000000..8d7bfd52c7f --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFieldFragList.java @@ -0,0 +1,55 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo.SubInfo; +import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo; + +import java.util.ArrayList; +import java.util.List; + +/** + * A simple implementation of {@link XFieldFragList}. 
+ */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XSimpleFieldFragList extends XFieldFragList { + + /** + * a constructor. + * + * @param fragCharSize the length (number of chars) of a fragment + */ + public XSimpleFieldFragList( int fragCharSize ) { + super( fragCharSize ); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List phraseInfoList ) + */ + @Override + public void add( int startOffset, int endOffset, List phraseInfoList ) { + float totalBoost = 0; + List subInfos = new ArrayList(); + for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ + subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) ); + totalBoost += phraseInfo.getBoost(); + } + getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) ); + } + +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragListBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragListBuilder.java new file mode 100644 index 00000000000..e9450b6cbae --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragListBuilder.java @@ -0,0 +1,43 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * A simple implementation of {@link XFragListBuilder}. + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XSimpleFragListBuilder extends XBaseFragListBuilder { + + public XSimpleFragListBuilder() { + super(); + } + + public XSimpleFragListBuilder(int margin) { + super(margin); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.vectorhighlight.FragListBuilder#createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) + */ + @Override + public XFieldFragList createFieldFragList( XFieldPhraseList fieldPhraseList, int fragCharSize ){ + return createFieldFragList( fieldPhraseList, new XSimpleFieldFragList( fragCharSize ), fragCharSize ); + } + +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragmentsBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragmentsBuilder.java new file mode 100644 index 00000000000..684ee2c22d6 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XSimpleFragmentsBuilder.java @@ -0,0 +1,63 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo; + +import java.util.List; + +/** + * A simple implementation of FragmentsBuilder. + * + */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XSimpleFragmentsBuilder extends XBaseFragmentsBuilder { + + /** + * a constructor. + */ + public XSimpleFragmentsBuilder() { + super(); + } + + /** + * a constructor. + * + * @param preTags array of pre-tags for markup terms. + * @param postTags array of post-tags for markup terms. + */ + public XSimpleFragmentsBuilder( String[] preTags, String[] postTags ) { + super( preTags, postTags ); + } + + public XSimpleFragmentsBuilder( BoundaryScanner bs ) { + super( bs ); + } + + public XSimpleFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner bs ) { + super( preTags, postTags, bs ); + } + + /** + * do nothing. return the source list. + */ + @Override + public List getWeightedFragInfoList( List src ) { + return src; + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/XSingleFragListBuilder.java b/src/main/java/org/apache/lucene/search/vectorhighlight/XSingleFragListBuilder.java new file mode 100644 index 00000000000..30a19462add --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/XSingleFragListBuilder.java @@ -0,0 +1,60 @@ +package org.apache.lucene.search.vectorhighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * An implementation class of {@link XFragListBuilder} that generates one {@link WeightedFragInfo} object. + * Typical use case of this class is that you can get an entire field contents + * by using both of this class and {@link XSimpleFragmentsBuilder}.
+ *

+ * XFastVectorHighlighter h = new XFastVectorHighlighter( true, true,
+ *   new XSingleFragListBuilder(), new XSimpleFragmentsBuilder() );
+ * 
+ */ +//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT +public class XSingleFragListBuilder implements XFragListBuilder { + + @Override + public XFieldFragList createFieldFragList(XFieldPhraseList fieldPhraseList, + int fragCharSize) { + + XFieldFragList ffl = new XSimpleFieldFragList( fragCharSize ); + + List wpil = new ArrayList(); + Iterator ite = fieldPhraseList.phraseList.iterator(); + WeightedPhraseInfo phraseInfo = null; + while( true ){ + if( !ite.hasNext() ) break; + phraseInfo = ite.next(); + if( phraseInfo == null ) break; + + wpil.add( phraseInfo ); + } + if( wpil.size() > 0 ) + ffl.add( 0, Integer.MAX_VALUE, wpil ); + return ffl; + } + +} diff --git a/src/main/java/org/elasticsearch/common/lucene/search/vectorhighlight/SimpleBoundaryScanner2.java b/src/main/java/org/elasticsearch/common/lucene/search/vectorhighlight/SimpleBoundaryScanner2.java index 7a92ae7e327..92f73bf733a 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/vectorhighlight/SimpleBoundaryScanner2.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/vectorhighlight/SimpleBoundaryScanner2.java @@ -4,7 +4,7 @@ import gnu.trove.set.hash.TCharHashSet; import org.apache.lucene.search.vectorhighlight.BoundaryScanner; /** - * A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner}. + * A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.XSimpleBoundaryScanner}. *

* Uses specialized char set to lookup boundary, and fixes a problem with start offset in the * beginning of the text: https://issues.apache.org/jira/browse/LUCENE-3697 (which has a problem diff --git a/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java b/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java index 10c686f50d7..a1b4244f493 100644 --- a/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java +++ b/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java @@ -18,20 +18,11 @@ */ package org.elasticsearch.search.highlight; -import java.util.Map; - +import com.google.common.collect.Maps; import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.SimpleHTMLEncoder; -import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder; -import org.apache.lucene.search.vectorhighlight.BoundaryScanner; -import org.apache.lucene.search.vectorhighlight.CustomFieldQuery; -import org.apache.lucene.search.vectorhighlight.FieldQuery; -import org.apache.lucene.search.vectorhighlight.FragListBuilder; -import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; -import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder; -import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder; -import org.apache.lucene.search.vectorhighlight.SingleFragListBuilder; +import org.apache.lucene.search.vectorhighlight.*; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2; @@ -45,7 +36,7 @@ import org.elasticsearch.search.highlight.vectorhighlight.SourceScoreOrderFragme import org.elasticsearch.search.highlight.vectorhighlight.SourceSimpleFragmentsBuilder; import org.elasticsearch.search.internal.SearchContext; -import 
com.google.common.collect.Maps; +import java.util.Map; /** * @@ -85,10 +76,10 @@ public class FastVectorHighlighter implements Highlighter { try { MapperHighlightEntry entry = cache.mappers.get(mapper); - FieldQuery fieldQuery = null; + XFieldQuery fieldQuery = null; if (entry == null) { - FragListBuilder fragListBuilder; - BaseFragmentsBuilder fragmentsBuilder; + XFragListBuilder fragListBuilder; + XBaseFragmentsBuilder fragmentsBuilder; BoundaryScanner boundaryScanner = SimpleBoundaryScanner2.DEFAULT; if (field.boundaryMaxScan() != SimpleBoundaryScanner2.DEFAULT_MAX_SCAN || field.boundaryChars() != SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS) { @@ -96,7 +87,7 @@ public class FastVectorHighlighter implements Highlighter { } if (field.numberOfFragments() == 0) { - fragListBuilder = new SingleFragListBuilder(); + fragListBuilder = new XSingleFragListBuilder(); if (mapper.fieldType().stored()) { fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.preTags(), field.postTags(), boundaryScanner); @@ -104,10 +95,10 @@ public class FastVectorHighlighter implements Highlighter { fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner); } } else { - fragListBuilder = field.fragmentOffset() == -1 ? new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fragmentOffset()); + fragListBuilder = field.fragmentOffset() == -1 ? 
new XSimpleFragListBuilder() : new XSimpleFragListBuilder(field.fragmentOffset()); if (field.scoreOrdered()) { if (mapper.fieldType().stored()) { - fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner); + fragmentsBuilder = new XScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner); } else { fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner); } @@ -127,7 +118,7 @@ public class FastVectorHighlighter implements Highlighter { // parameters to FVH are not requires since: // first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch) // fragment builders are used explicitly - cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter(); + cache.fvh = new org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter(); } CustomFieldQuery.highlightFilters.set(field.highlightFilter()); if (field.requireFieldMatch()) { @@ -166,16 +157,16 @@ public class FastVectorHighlighter implements Highlighter { } private class MapperHighlightEntry { - public FragListBuilder fragListBuilder; - public FragmentsBuilder fragmentsBuilder; + public XFragListBuilder fragListBuilder; + public XFragmentsBuilder fragmentsBuilder; public org.apache.lucene.search.highlight.Highlighter highlighter; } private class HighlighterEntry { - public org.apache.lucene.search.vectorhighlight.FastVectorHighlighter fvh; - public FieldQuery noFieldMatchFieldQuery; - public FieldQuery fieldMatchFieldQuery; + public org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter fvh; + public XFieldQuery noFieldMatchFieldQuery; + public XFieldQuery fieldMatchFieldQuery; public Map mappers = Maps.newHashMap(); } diff --git a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java 
b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java index a036c3abfdf..e306dc945cb 100644 --- a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java +++ b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/FragmentBuilderHelper.java @@ -22,10 +22,10 @@ package org.elasticsearch.search.highlight.vectorhighlight; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Field; -import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; -import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; +import org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter; +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo.SubInfo; +import org.apache.lucene.search.vectorhighlight.XFragmentsBuilder; import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.Version; import org.elasticsearch.index.analysis.*; @@ -35,7 +35,7 @@ import java.util.Comparator; import java.util.List; /** - * Simple helper class for {@link FastVectorHighlighter} {@link FragmentsBuilder} implemenations. + * Simple helper class for {@link XFastVectorHighlighter} {@link XFragmentsBuilder} implemenations. 
*/ public final class FragmentBuilderHelper { @@ -45,7 +45,7 @@ public final class FragmentBuilderHelper { /** * Fixes problems with broken analysis chains if positions and offsets are messed up that can lead to - * {@link StringIndexOutOfBoundsException} in the {@link FastVectorHighlighter} + * {@link StringIndexOutOfBoundsException} in the {@link XFastVectorHighlighter} */ public static WeightedFragInfo fixWeightedFragInfo(FieldMapper mapper, Field[] values, WeightedFragInfo fragInfo) { assert fragInfo != null : "FragInfo must not be null"; diff --git a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SimpleFragmentsBuilder.java b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SimpleFragmentsBuilder.java index d747eafd681..113e9832683 100644 --- a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SimpleFragmentsBuilder.java +++ b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SimpleFragmentsBuilder.java @@ -21,14 +21,14 @@ package org.elasticsearch.search.highlight.vectorhighlight; import org.apache.lucene.document.Field; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.vectorhighlight.BoundaryScanner; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo; import org.elasticsearch.index.mapper.FieldMapper; /** * Direct Subclass of Lucene's org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder * that corrects offsets for broken analysis chains. 
*/ -public class SimpleFragmentsBuilder extends org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder { +public class SimpleFragmentsBuilder extends org.apache.lucene.search.vectorhighlight.XSimpleFragmentsBuilder { protected final FieldMapper mapper; public SimpleFragmentsBuilder(FieldMapper mapper, diff --git a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java index 72d3c24763d..9bf3e53c109 100644 --- a/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java +++ b/src/main/java/org/elasticsearch/search/highlight/vectorhighlight/SourceScoreOrderFragmentsBuilder.java @@ -19,32 +19,25 @@ package org.elasticsearch.search.highlight.vectorhighlight; -import java.io.IOException; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.ngram.NGramTokenizerFactory; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.vectorhighlight.BoundaryScanner; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; -import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder; -import org.elasticsearch.index.analysis.CustomAnalyzer; -import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.XScoreOrderFragmentsBuilder; import org.elasticsearch.index.mapper.FieldMapper; import 
org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.search.lookup.SearchLookup; +import java.io.IOException; +import java.util.List; + /** * */ -public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder { +public class SourceScoreOrderFragmentsBuilder extends XScoreOrderFragmentsBuilder { private final FieldMapper mapper; diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java index 48886f4ccfc..b362499305b 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java @@ -28,11 +28,9 @@ import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.ImmutableSettings.Builder; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; -import org.elasticsearch.index.query.FilterBuilders; -import org.elasticsearch.index.query.MatchQueryBuilder; +import org.elasticsearch.index.query.*; import org.elasticsearch.index.query.MatchQueryBuilder.Operator; import org.elasticsearch.index.query.MatchQueryBuilder.Type; -import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; @@ -1081,70 +1079,6 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest { } - @Test - public void testDisableFastVectorHighlighter() throws Exception { - client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2)) - .addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties") - .startObject("title").field("type", 
"string").field("store", "yes").field("term_vector", "with_positions_offsets").endObject() - .endObject().endObject().endObject()) - .execute().actionGet(); - ensureGreen(); - - for (int i = 0; i < 5; i++) { - client().prepareIndex("test", "type1", Integer.toString(i)) - .setSource("title", "This is a test for the workaround for the fast vector highlighting SOLR-3724").execute().actionGet(); - } - refresh(); - SearchResponse search = client().prepareSearch() - .setQuery(matchPhraseQuery("title", "test for the workaround")) - .addHighlightedField("title", 50, 1, 10) - .execute().actionGet(); - - assertThat(Arrays.toString(search.getShardFailures()), search.getFailedShards(), equalTo(0)); - - assertThat(search.getHits().totalHits(), equalTo(5l)); - assertThat(search.getHits().hits().length, equalTo(5)); - - for (SearchHit hit : search.getHits()) { - // Because of SOLR-3724 nothing is highlighted when FVH is used - assertThat(hit.highlightFields().isEmpty(), equalTo(true)); - } - - // Using plain highlighter instead of FVH - search = client().prepareSearch() - .setQuery(matchPhraseQuery("title", "test for the workaround")) - .addHighlightedField("title", 50, 1, 10) - .setHighlighterType("highlighter") - .execute().actionGet(); - - assertThat(Arrays.toString(search.getShardFailures()), search.getFailedShards(), equalTo(0)); - - assertThat(search.getHits().totalHits(), equalTo(5l)); - assertThat(search.getHits().hits().length, equalTo(5)); - - for (SearchHit hit : search.getHits()) { - // With plain highlighter terms are highlighted correctly - assertThat(hit.highlightFields().get("title").fragments()[0].string(), equalTo("This is a test for the workaround for the fast vector highlighting SOLR-3724")); - } - - // Using plain highlighter instead of FVH on the field level - search = client().prepareSearch() - .setQuery(matchPhraseQuery("title", "test for the workaround")) - .addHighlightedField(new HighlightBuilder.Field("title").highlighterType("highlighter")) - 
.setHighlighterType("highlighter") - .execute().actionGet(); - - assertThat(Arrays.toString(search.getShardFailures()), search.getFailedShards(), equalTo(0)); - - assertThat(search.getHits().totalHits(), equalTo(5l)); - assertThat(search.getHits().hits().length, equalTo(5)); - - for (SearchHit hit : search.getHits()) { - // With plain highlighter terms are highlighted correctly - assertThat(hit.highlightFields().get("title").fragments()[0].string(), equalTo("This is a test for the workaround for the fast vector highlighting SOLR-3724")); - } - } - @Test public void testFSHHighlightAllMvFragments() throws Exception { client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder() @@ -1534,4 +1468,54 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest { assertThat(response.getFailedShards(), equalTo(0)); } + @Test + public void testHighlightComplexPhraseQuery() throws Exception { + prepareCreate("test") + .setSettings(ImmutableSettings.builder() + .put("analysis.analyzer.code.type", "custom") + .put("analysis.analyzer.code.tokenizer", "code") + .put("analysis.analyzer.code.filter", "code,lowercase") + .put("analysis.tokenizer.code.type", "pattern") + .put("analysis.tokenizer.code.pattern", "[.,:;/\"<>(){}\\[\\]\\s]") + .put("analysis.filter.code.type", "word_delimiter") + .put("analysis.filter.code.generate_word_parts", "true") + .put("analysis.filter.code.generate_number_parts", "true") + .put("analysis.filter.code.catenate_words", "false") + .put("analysis.filter.code.catenate_numbers", "false") + .put("analysis.filter.code.catenate_all", "false") + .put("analysis.filter.code.split_on_case_change", "true") + .put("analysis.filter.code.preserve_original", "true") + .put("analysis.filter.code.split_on_numerics", "true") + .put("analysis.filter.code.stem_english_possessive", "false") + .build()) + .addMapping("type", jsonBuilder() + .startObject() + .startObject("type") + .startObject("properties") + 
.startObject("text") + .field("type", "string") + .field("analyzer", "code") + .field("term_vector", "with_positions_offsets") + .endObject() + .endObject() + .endObject() + .endObject()) + .execute().actionGet(); + + ensureGreen(); + client().prepareIndex("test", "type", "1") + .setSource(jsonBuilder().startObject() + .field("text", "def log_worker_status( worker )\n pass") + .endObject()) + .setRefresh(true) + .execute().actionGet(); + + SearchResponse response = client().prepareSearch("test") + .setQuery(QueryBuilders.matchPhraseQuery("text", "def log_worker_status( worker )")) + .addHighlightedField("text").execute().actionGet(); + assertThat(response.getFailedShards(), equalTo(0)); + assertThat(response.getHits().totalHits(), equalTo(1L)); + assertThat(response.getHits().getAt(0).getHighlightFields().get("text").fragments()[0].string(), equalTo("def log_worker_status( worker )\n pass")); + } + } diff --git a/src/test/java/org/elasticsearch/test/unit/deps/lucene/VectorHighlighterTests.java b/src/test/java/org/elasticsearch/test/unit/deps/lucene/VectorHighlighterTests.java index f253594fe15..f68306bc9cc 100644 --- a/src/test/java/org/elasticsearch/test/unit/deps/lucene/VectorHighlighterTests.java +++ b/src/test/java/org/elasticsearch/test/unit/deps/lucene/VectorHighlighterTests.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; import org.apache.lucene.search.vectorhighlight.CustomFieldQuery; -import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; +import org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.elasticsearch.common.lucene.Lucene; @@ -60,7 +60,7 @@ public class VectorHighlighterTests { assertThat(topDocs.totalHits, equalTo(1)); - FastVectorHighlighter highlighter = new FastVectorHighlighter(); + XFastVectorHighlighter highlighter = 
new XFastVectorHighlighter(); String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))), reader, topDocs.scoreDocs[0].doc, "content", 30); assertThat(fragment, notNullValue()); @@ -83,7 +83,7 @@ public class VectorHighlighterTests { assertThat(topDocs.totalHits, equalTo(1)); - FastVectorHighlighter highlighter = new FastVectorHighlighter(); + XFastVectorHighlighter highlighter = new XFastVectorHighlighter(); PrefixQuery prefixQuery = new PrefixQuery(new Term("content", "ba")); assertThat(prefixQuery.getRewriteMethod().getClass().getName(), equalTo(PrefixQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT.getClass().getName())); @@ -125,7 +125,7 @@ public class VectorHighlighterTests { assertThat(topDocs.totalHits, equalTo(1)); - FastVectorHighlighter highlighter = new FastVectorHighlighter(); + XFastVectorHighlighter highlighter = new XFastVectorHighlighter(); String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))), reader, topDocs.scoreDocs[0].doc, "content", 30); assertThat(fragment, nullValue()); @@ -147,7 +147,7 @@ public class VectorHighlighterTests { assertThat(topDocs.totalHits, equalTo(1)); - FastVectorHighlighter highlighter = new FastVectorHighlighter(); + XFastVectorHighlighter highlighter = new XFastVectorHighlighter(); String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))), reader, topDocs.scoreDocs[0].doc, "content", 30); assertThat(fragment, nullValue());