Add FastVectorHighlighter support for more complex queries.

FastVectorHighlighter fails to highlight some complex queries, such as
multi-phrase queries which have two terms at the same position. This can be
easily triggered by running a `match_phrase` query with an analyzer which
outputs synonyms, such as SynonymFilter or WordDelimiterFilter.

Close #3357
This commit is contained in:
Adrien Grand 2013-07-19 12:21:24 +02:00
parent 6b21414520
commit e943cc81a5
22 changed files with 2355 additions and 120 deletions

View File

@ -40,7 +40,7 @@ import java.util.List;
* *
*/ */
// LUCENE MONITOR // LUCENE MONITOR
public class CustomFieldQuery extends FieldQuery { public class CustomFieldQuery extends XFieldQuery {
private static Field multiTermQueryWrapperFilterQueryField; private static Field multiTermQueryWrapperFilterQueryField;
@ -55,7 +55,7 @@ public class CustomFieldQuery extends FieldQuery {
public static final ThreadLocal<Boolean> highlightFilters = new ThreadLocal<Boolean>(); public static final ThreadLocal<Boolean> highlightFilters = new ThreadLocal<Boolean>();
public CustomFieldQuery(Query query, IndexReader reader, FastVectorHighlighter highlighter) throws IOException { public CustomFieldQuery(Query query, IndexReader reader, XFastVectorHighlighter highlighter) throws IOException {
this(query, reader, highlighter.isPhraseHighlight(), highlighter.isFieldMatch()); this(query, reader, highlighter.isPhraseHighlight(), highlighter.isFieldMatch());
} }

View File

@ -0,0 +1,144 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * An abstract implementation of {@link XFragListBuilder}.
 * <p>
 * Walks the phrases of a {@link XFieldPhraseList} in order and groups them
 * into non-overlapping fragments of (at least) the requested character size.
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public abstract class XBaseFragListBuilder implements XFragListBuilder {

    /** Margin used by the no-arg constructor. */
    public static final int MARGIN_DEFAULT = 6;
    /** The minimum fragment size is the margin multiplied by this factor (but never below 1). */
    public static final int MIN_FRAG_CHAR_SIZE_FACTOR = 3;

    final int margin;
    final int minFragCharSize;

    /**
     * Creates a builder with the given margin.
     *
     * @param margin number of characters kept before the first phrase when sizing a fragment
     * @throws IllegalArgumentException if <code>margin</code> is negative
     */
    public XBaseFragListBuilder(int margin) {
        if (margin < 0) {
            throw new IllegalArgumentException("margin(" + margin + ") is too small. It must be 0 or higher.");
        }
        this.margin = margin;
        this.minFragCharSize = Math.max(1, margin * MIN_FRAG_CHAR_SIZE_FACTOR);
    }

    /**
     * Creates a builder using {@link #MARGIN_DEFAULT}.
     */
    public XBaseFragListBuilder() {
        this(MARGIN_DEFAULT);
    }

    /**
     * Fills <code>fieldFragList</code> with fragments built from the phrases of
     * <code>fieldPhraseList</code>.
     *
     * @param fieldPhraseList the phrases to distribute over fragments
     * @param fieldFragList   the list that receives the fragments; it is also the return value
     * @param fragCharSize    the requested fragment size in characters
     * @return <code>fieldFragList</code>
     * @throws IllegalArgumentException if <code>fragCharSize</code> is smaller than the minimum
     *                                  derived from the margin
     */
    protected XFieldFragList createFieldFragList(XFieldPhraseList fieldPhraseList, XFieldFragList fieldFragList, int fragCharSize) {
        if (fragCharSize < minFragCharSize) {
            throw new IllegalArgumentException("fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher.");
        }

        // The same list instance is reused (and cleared) for every fragment.
        final List<WeightedPhraseInfo> accepted = new ArrayList<WeightedPhraseInfo>();
        final IteratorQueue<WeightedPhraseInfo> pending =
                new IteratorQueue<WeightedPhraseInfo>(fieldPhraseList.getPhraseList().iterator());
        int previousFragEnd = 0;

        for (WeightedPhraseInfo head = pending.top(); head != null; head = pending.top()) {
            // A phrase starting inside the previous fragment is dropped; move on to the next one.
            if (head.getStartOffset() < previousFragEnd) {
                pending.removeTop();
                continue;
            }

            accepted.clear();
            final int firstStart = head.getStartOffset();
            int lastEnd = head.getEndOffset();
            final int provisionalStart = Math.max(firstStart - margin, previousFragEnd);
            final int provisionalEnd = Math.max(lastEnd, provisionalStart + fragCharSize);

            pending.removeTop();
            if (acceptPhrase(head, lastEnd - firstStart, fragCharSize)) {
                accepted.add(head);
            }

            // Consume every following phrase that still ends within the provisional span.
            WeightedPhraseInfo next = pending.top();
            while (next != null && next.getEndOffset() <= provisionalEnd) {
                lastEnd = next.getEndOffset();
                pending.removeTop();
                if (acceptPhrase(next, lastEnd - firstStart, fragCharSize)) {
                    accepted.add(next);
                }
                next = pending.top();
            }

            if (accepted.isEmpty()) {
                continue;
            }

            // Re-compute the boundaries so that the matched region is "centered" in the fragment.
            // matchLen may exceed fragCharSize, hence the clamp of the margin to 0.
            final int matchLen = lastEnd - firstStart;
            final int centeringMargin = Math.max(0, (fragCharSize - matchLen) / 2);
            final int fragStart = Math.max(firstStart - centeringMargin, previousFragEnd);
            // Grow to whichever is bigger: the match itself or the requested size.
            final int fragEnd = fragStart + Math.max(matchLen, fragCharSize);

            previousFragEnd = fragEnd;
            fieldFragList.add(fragStart, fragEnd, accepted);
        }
        return fieldFragList;
    }

    /**
     * A predicate to decide if the given {@link WeightedPhraseInfo} should be
     * accepted as a highlighted phrase or if it should be discarded.
     * <p>
     * The default implementation discards phrases that are composed of more than one term
     * and where the matchLength exceeds the fragment character size.
     *
     * @param info         the phrase info to accept
     * @param matchLength  the match length of the current phrase
     * @param fragCharSize the configured fragment character size
     * @return <code>true</code> if this phrase info should be accepted as a highlight phrase
     */
    protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) {
        final boolean atMostOneTerm = info.getTermsOffsets().size() <= 1;
        return atMostOneTerm || matchLength <= fragCharSize;
    }

    /**
     * Minimal look-ahead wrapper around an {@link Iterator}: {@link #top()} peeks at the
     * current element and {@link #removeTop()} returns it while advancing to the next one.
     */
    private static final class IteratorQueue<T> {
        private final Iterator<T> iter;
        private T top;

        public IteratorQueue(Iterator<T> iter) {
            this.iter = iter;
            // Prime the queue: loads the first element into 'top'; nothing was there before.
            T previous = removeTop();
            assert previous == null;
        }

        /** @return the current head element, or <code>null</code> once the iterator is exhausted */
        public T top() {
            return top;
        }

        /** @return the head element before the call; the head then becomes the next element or <code>null</code> */
        public T removeTop() {
            final T removed = top;
            top = iter.hasNext() ? iter.next() : null;
            return removed;
        }
    }
}

View File

@ -0,0 +1,332 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo.Toffs;
import org.apache.lucene.util.CollectionUtil;
import java.io.IOException;
import java.util.*;
/**
 * Base FragmentsBuilder implementation that supports colored pre/post
 * tags and multivalued fields.
 * <p>
 * Uses a {@link BoundaryScanner} to determine fragments.
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public abstract class XBaseFragmentsBuilder implements XFragmentsBuilder {

    /** Tags written before / after each highlighted term when the caller supplies none. */
    protected String[] preTags, postTags;

    /** A palette of pre-tags; tags are picked by sequence number modulo the array length. */
    public static final String[] COLORED_PRE_TAGS = {
        "<b style=\"background:yellow\">", "<b style=\"background:lawngreen\">", "<b style=\"background:aquamarine\">",
        "<b style=\"background:magenta\">", "<b style=\"background:palegreen\">", "<b style=\"background:coral\">",
        "<b style=\"background:wheat\">", "<b style=\"background:khaki\">", "<b style=\"background:lime\">",
        "<b style=\"background:deepskyblue\">", "<b style=\"background:deeppink\">", "<b style=\"background:salmon\">",
        "<b style=\"background:peachpuff\">", "<b style=\"background:violet\">", "<b style=\"background:mediumpurple\">",
        "<b style=\"background:palegoldenrod\">", "<b style=\"background:darkkhaki\">", "<b style=\"background:springgreen\">",
        "<b style=\"background:turquoise\">", "<b style=\"background:powderblue\">"
    };
    /** The post-tag matching {@link #COLORED_PRE_TAGS}. */
    public static final String[] COLORED_POST_TAGS = { "</b>" };

    /** Character appended after every field value when the values are concatenated. */
    private char multiValuedSeparator = ' ';
    private final BoundaryScanner boundaryScanner;
    private boolean discreteMultiValueHighlighting = false;

    /**
     * Creates a builder using <code>&lt;b&gt;</code> / <code>&lt;/b&gt;</code> tags
     * and a {@link SimpleBoundaryScanner}.
     */
    protected XBaseFragmentsBuilder() {
        this( new String[]{ "<b>" }, new String[]{ "</b>" } );
    }

    /**
     * Creates a builder using the given tags and a {@link SimpleBoundaryScanner}.
     *
     * @param preTags  tags written before a highlighted term
     * @param postTags tags written after a highlighted term
     */
    protected XBaseFragmentsBuilder( String[] preTags, String[] postTags ) {
        this( preTags, postTags, new SimpleBoundaryScanner() );
    }

    /**
     * Creates a builder using <code>&lt;b&gt;</code> / <code>&lt;/b&gt;</code> tags
     * and the given boundary scanner.
     *
     * @param boundaryScanner scanner used to adjust fragment start and end offsets
     */
    protected XBaseFragmentsBuilder( BoundaryScanner boundaryScanner ) {
        this( new String[]{ "<b>" }, new String[]{ "</b>" }, boundaryScanner );
    }

    /**
     * Creates a builder using the given tags and boundary scanner.
     *
     * @param preTags         tags written before a highlighted term
     * @param postTags        tags written after a highlighted term
     * @param boundaryScanner scanner used to adjust fragment start and end offsets
     */
    protected XBaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ) {
        this.preTags = preTags;
        this.postTags = postTags;
        this.boundaryScanner = boundaryScanner;
    }

    /**
     * Validates that a tags argument is either a {@link String} or a <code>String[]</code>.
     *
     * @param tags the value to check
     * @return <code>tags</code> itself when it has an accepted type
     * @throws IllegalArgumentException for any other type (including <code>null</code>)
     */
    static Object checkTagsArgument( Object tags ) {
        if( tags instanceof String ) return tags;
        else if( tags instanceof String[] ) return tags;
        throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" );
    }

    /**
     * Hook for subclasses to order (or otherwise transform) the fragment infos
     * before fragments are created from them.
     *
     * @param src the fragment infos as produced by the frag list (possibly split per field value)
     * @return the list whose first entries are turned into fragments
     */
    public abstract List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src );

    /** Encoder used by the overloads that do not take an explicit encoder. */
    private static final Encoder NULL_ENCODER = new DefaultEncoder();

    /**
     * Creates a single fragment using this builder's own tags and a {@link DefaultEncoder}.
     *
     * @return the fragment, or <code>null</code> if none could be created
     */
    @Override
    public String createFragment( IndexReader reader, int docId,
            String fieldName, XFieldFragList fieldFragList ) throws IOException {
        return createFragment( reader, docId, fieldName, fieldFragList,
                preTags, postTags, NULL_ENCODER );
    }

    /**
     * Creates up to <code>maxNumFragments</code> fragments using this builder's own tags
     * and a {@link DefaultEncoder}.
     *
     * @return the fragments, or <code>null</code> if the document has no stored value for the field
     */
    @Override
    public String[] createFragments( IndexReader reader, int docId,
            String fieldName, XFieldFragList fieldFragList, int maxNumFragments )
            throws IOException {
        return createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments,
                preTags, postTags, NULL_ENCODER );
    }

    /**
     * Creates a single fragment with the given tags and encoder.
     *
     * @return the first fragment, or <code>null</code> if no fragment was created
     */
    @Override
    public String createFragment( IndexReader reader, int docId,
            String fieldName, XFieldFragList fieldFragList, String[] preTags, String[] postTags,
            Encoder encoder ) throws IOException {
        String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1,
                preTags, postTags, encoder );
        if( fragments == null || fragments.length == 0 ) return null;
        return fragments[0];
    }

    /**
     * Creates up to <code>maxNumFragments</code> fragments with the given tags and encoder.
     *
     * @param reader          reader used to load the stored field values
     * @param docId           the document to highlight
     * @param fieldName       the field to highlight
     * @param fieldFragList   the fragment infos to build fragments from
     * @param maxNumFragments upper bound on the number of returned fragments
     * @param preTags         tags written before a highlighted term
     * @param postTags        tags written after a highlighted term
     * @param encoder         encoder applied to every piece of source text
     * @return the fragments, or <code>null</code> if the document has no stored value for the field
     * @throws IllegalArgumentException if <code>maxNumFragments</code> is negative
     * @throws IOException              if loading the stored fields fails
     */
    @Override
    public String[] createFragments( IndexReader reader, int docId,
            String fieldName, XFieldFragList fieldFragList, int maxNumFragments,
            String[] preTags, String[] postTags, Encoder encoder ) throws IOException {
        if( maxNumFragments < 0 ) {
            throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
        }

        List<WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
        Field[] values = getFields( reader, docId, fieldName );
        if( values.length == 0 ) {
            return null;
        }

        if (discreteMultiValueHighlighting && values.length > 1) {
            fragInfos = discreteMultiValueHighlighting(fragInfos, values);
        }

        fragInfos = getWeightedFragInfoList(fragInfos);
        int limitFragments = maxNumFragments < fragInfos.size() ? maxNumFragments : fragInfos.size();
        List<String> fragments = new ArrayList<String>( limitFragments );

        // 'buffer' accumulates the concatenated field values and 'nextValueIndex' tracks how many
        // values have been appended so far; both are shared across all makeFragment calls.
        StringBuilder buffer = new StringBuilder();
        int[] nextValueIndex = { 0 };
        for( int n = 0; n < limitFragments; n++ ){
            WeightedFragInfo fragInfo = fragInfos.get( n );
            fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder ) );
        }
        return fragments.toArray( new String[fragments.size()] );
    }

    /**
     * Loads the stored string values of <code>fieldName</code> for the given document.
     * Each value is wrapped in a {@link Field} based on {@link TextField#TYPE_STORED}, with the
     * term-vector flag copied from the field's {@link FieldInfo}.
     *
     * @return the values in the order the visitor received them; empty if there are none
     * @throws IOException if reading the document fails
     */
    protected Field[] getFields( IndexReader reader, int docId, final String fieldName) throws IOException {
        // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field???
        final List<Field> fields = new ArrayList<Field>();
        reader.document(docId, new StoredFieldVisitor() {

            @Override
            public void stringField(FieldInfo fieldInfo, String value) {
                FieldType ft = new FieldType(TextField.TYPE_STORED);
                ft.setStoreTermVectors(fieldInfo.hasVectors());
                fields.add(new Field(fieldInfo.name, value, ft));
            }

            @Override
            public Status needsField(FieldInfo fieldInfo) {
                // only visit the field that is being highlighted
                return fieldInfo.name.equals(fieldName) ? Status.YES : Status.NO;
            }
        });
        return fields.toArray(new Field[fields.size()]);
    }

    /**
     * Builds the text of one fragment: the fragment's source text with every term offset
     * wrapped in the pre/post tag selected by the sub info's sequence number.
     *
     * @param buffer   shared buffer holding the field values concatenated so far
     * @param index    single-element array holding the index of the next value to append
     * @param values   all stored values of the field
     * @param fragInfo the fragment to render
     * @param preTags  tags written before a highlighted term
     * @param postTags tags written after a highlighted term
     * @param encoder  encoder applied to every piece of source text
     * @return the rendered fragment
     */
    protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
            String[] preTags, String[] postTags, Encoder encoder ){
        StringBuilder fragment = new StringBuilder();
        final int s = fragInfo.getStartOffset();
        // The boundary scanner may move the start; term offsets must be shifted by the real start.
        int[] modifiedStartOffset = { s };
        String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset );
        int srcIndex = 0;
        for( SubInfo subInfo : fragInfo.getSubInfos() ){
            for( Toffs to : subInfo.getTermsOffsets() ){
                fragment
                    .append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) )
                    .append( getPreTag( preTags, subInfo.getSeqnum() ) )
                    .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) )
                    .append( getPostTag( postTags, subInfo.getSeqnum() ) );
                srcIndex = to.getEndOffset() - modifiedStartOffset[0];
            }
        }
        // trailing text after the last highlighted term
        fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
        return fragment.toString();
    }

    /**
     * Returns the source text of a fragment, with start and end adjusted by the boundary
     * scanner. Field values are appended to <code>buffer</code> (each followed by the
     * multi-value separator) until the buffer covers <code>endOffset</code> or no values are left.
     *
     * @param buffer              shared buffer holding the field values concatenated so far
     * @param index               single-element array holding the index of the next value to append;
     *                            updated in place
     * @param values              all stored values of the field
     * @param startOffset         requested start offset of the fragment
     * @param endOffset           requested end offset of the fragment
     * @param modifiedStartOffset single-element array that receives the start offset actually used
     * @return the fragment source text
     */
    protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values,
            int startOffset, int endOffset, int[] modifiedStartOffset ){
        while( buffer.length() < endOffset && index[0] < values.length ){
            buffer.append( values[index[0]++].stringValue() );
            buffer.append( getMultiValuedSeparator() );
        }
        int bufferLength = buffer.length();
        // we added the multi value char to the last buffer, ignore it.
        // Guard on index[0] > 0: if no value has been appended yet there is no "last value"
        // (and no trailing separator), and values[-1] would throw.
        if (index[0] > 0 && values[index[0] - 1].fieldType().tokenized()) {
            bufferLength--;
        }
        int eo = bufferLength < endOffset ? bufferLength : boundaryScanner.findEndOffset( buffer, endOffset );
        modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset );
        return buffer.substring( modifiedStartOffset[0], eo );
    }

    /**
     * Returns the source text of a fragment without consulting the boundary scanner.
     * Field values are appended to <code>buffer</code> (each followed by the multi-value
     * separator) until the buffer covers <code>endOffset</code> or no values are left.
     *
     * @return the buffer content from <code>startOffset</code> to <code>endOffset</code>,
     *         or to the end of the buffer if it is shorter
     */
    protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
            int startOffset, int endOffset ){
        while( buffer.length() < endOffset && index[0] < values.length ){
            buffer.append( values[index[0]].stringValue() );
            buffer.append( multiValuedSeparator );
            index[0]++;
        }
        int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
        return buffer.substring( startOffset, eo );
    }

    /**
     * Re-cuts the fragment infos so that every resulting fragment lies within a single field
     * value. A fragment that already fits inside one value is kept as is; otherwise a new
     * fragment is created per overlapped value, clipped to that value and holding only the
     * term offsets that fall inside it.
     * <p>
     * Note that term offsets moved into a new fragment are removed from the sub infos of the
     * incoming fragment.
     *
     * @param fragInfos the fragments to split
     * @param fields    the stored values of the field, in order
     * @return the resulting fragments sorted by start offset
     */
    protected List<WeightedFragInfo> discreteMultiValueHighlighting(List<WeightedFragInfo> fragInfos, Field[] fields) {
        Map<String, List<WeightedFragInfo>> fieldNameToFragInfos = new HashMap<String, List<WeightedFragInfo>>();
        for (Field field : fields) {
            fieldNameToFragInfos.put(field.name(), new ArrayList<WeightedFragInfo>());
        }

        fragInfos: for (WeightedFragInfo fragInfo : fragInfos) {
            int fieldStart;
            int fieldEnd = 0;
            for (Field field : fields) {
                if (field.stringValue().isEmpty()) {
                    // an empty value still contributes its separator
                    fieldEnd++;
                    continue;
                }
                fieldStart = fieldEnd;
                fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name.

                // fragment lies completely within this value: keep it unchanged
                if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart &&
                        fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) {
                    fieldNameToFragInfos.get(field.name()).add(fragInfo);
                    continue fragInfos;
                }

                // nothing (left) to highlight in this fragment
                if (fragInfo.getSubInfos().isEmpty()) {
                    continue fragInfos;
                }

                // fragment, or its first remaining term, starts after this value: try the next value
                Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0);
                if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) {
                    continue;
                }

                // clip the fragment to the boundaries of this value
                int fragStart = fieldStart;
                if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) {
                    fragStart = fragInfo.getStartOffset();
                }

                int fragEnd = fieldEnd;
                if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) {
                    fragEnd = fragInfo.getEndOffset();
                }

                List<SubInfo> subInfos = new ArrayList<SubInfo>();
                WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, fragInfo.getTotalBoost());

                // move the term offsets that fall inside this value to the new fragment
                Iterator<SubInfo> subInfoIterator = fragInfo.getSubInfos().iterator();
                while (subInfoIterator.hasNext()) {
                    SubInfo subInfo = subInfoIterator.next();
                    List<Toffs> toffsList = new ArrayList<Toffs>();
                    Iterator<Toffs> toffsIterator = subInfo.getTermsOffsets().iterator();
                    while (toffsIterator.hasNext()) {
                        Toffs toffs = toffsIterator.next();
                        if (toffs.getStartOffset() >= fieldStart && toffs.getEndOffset() <= fieldEnd) {
                            toffsList.add(toffs);
                            toffsIterator.remove();
                        }
                    }
                    if (!toffsList.isEmpty()) {
                        subInfos.add(new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum()));
                    }

                    if (subInfo.getTermsOffsets().isEmpty()) {
                        subInfoIterator.remove();
                    }
                }
                fieldNameToFragInfos.get(field.name()).add(weightedFragInfo);
            }
        }

        List<WeightedFragInfo> result = new ArrayList<WeightedFragInfo>();
        for (List<WeightedFragInfo> weightedFragInfos : fieldNameToFragInfos.values()) {
            result.addAll(weightedFragInfos);
        }
        CollectionUtil.timSort(result, new Comparator<WeightedFragInfo>() {

            @Override
            public int compare(XFieldFragList.WeightedFragInfo info1, XFieldFragList.WeightedFragInfo info2) {
                // explicit three-way comparison instead of subtraction, which could overflow
                final int start1 = info1.getStartOffset();
                final int start2 = info2.getStartOffset();
                return start1 < start2 ? -1 : (start1 == start2 ? 0 : 1);
            }

        });

        return result;
    }

    /**
     * Sets the character appended after each field value when values are concatenated.
     */
    public void setMultiValuedSeparator( char separator ){
        multiValuedSeparator = separator;
    }

    /**
     * @return the character appended after each field value when values are concatenated
     */
    public char getMultiValuedSeparator(){
        return multiValuedSeparator;
    }

    /**
     * @return whether fragments are split per field value for multi-valued fields
     */
    public boolean isDiscreteMultiValueHighlighting() {
        return discreteMultiValueHighlighting;
    }

    /**
     * Enables or disables splitting of fragments per field value for multi-valued fields.
     */
    public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
        this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
    }

    /**
     * @return the pre-tag for the given number, taken from this builder's own pre-tags
     */
    protected String getPreTag( int num ){
        return getPreTag( preTags, num );
    }

    /**
     * @return the post-tag for the given number, taken from this builder's own post-tags
     */
    protected String getPostTag( int num ){
        return getPostTag( postTags, num );
    }

    /**
     * @return <code>preTags[num % preTags.length]</code>
     */
    protected String getPreTag( String[] preTags, int num ){
        int n = num % preTags.length;
        return preTags[n];
    }

    /**
     * @return <code>postTags[num % postTags.length]</code>
     */
    protected String getPostTag( String[] postTags, int num ){
        int n = num % postTags.length;
        return postTags[n];
    }
}

View File

@ -0,0 +1,223 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Encoder;
import java.io.IOException;
/**
 * Another highlighter implementation.
 * <p>
 * Combines a {@link XFragListBuilder} (which decides where fragments start and end)
 * with a {@link XFragmentsBuilder} (which renders the fragment text).
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XFastVectorHighlighter {

    public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true;
    public static final boolean DEFAULT_FIELD_MATCH = true;

    private final boolean phraseHighlight;
    private final boolean fieldMatch;
    private final XFragListBuilder fragListBuilder;
    private final XFragmentsBuilder fragmentsBuilder;
    private int phraseLimit = Integer.MAX_VALUE;

    /**
     * Creates a highlighter with {@link #DEFAULT_PHRASE_HIGHLIGHT} and
     * {@link #DEFAULT_FIELD_MATCH}.
     */
    public XFastVectorHighlighter() {
        this(DEFAULT_PHRASE_HIGHLIGHT, DEFAULT_FIELD_MATCH);
    }

    /**
     * Creates a highlighter backed by a {@link XSimpleFragListBuilder} and a
     * {@link XScoreOrderFragmentsBuilder}.
     *
     * @param phraseHighlight true or false for phrase highlighting
     * @param fieldMatch      true or false for field matching
     */
    public XFastVectorHighlighter(boolean phraseHighlight, boolean fieldMatch) {
        this(phraseHighlight, fieldMatch, new XSimpleFragListBuilder(), new XScoreOrderFragmentsBuilder());
    }

    /**
     * Creates a highlighter with explicitly chosen (pluggable) builders.
     *
     * @param phraseHighlight  true or false for phrase highlighting
     * @param fieldMatch       true or false for field matching
     * @param fragListBuilder  an instance of {@link XFragListBuilder}
     * @param fragmentsBuilder an instance of {@link XFragmentsBuilder}
     */
    public XFastVectorHighlighter(boolean phraseHighlight, boolean fieldMatch,
                                  XFragListBuilder fragListBuilder, XFragmentsBuilder fragmentsBuilder) {
        this.phraseHighlight = phraseHighlight;
        this.fieldMatch = fieldMatch;
        this.fragListBuilder = fragListBuilder;
        this.fragmentsBuilder = fragmentsBuilder;
    }

    /**
     * Creates a {@link XFieldQuery} object without an index reader.
     *
     * @param query a query
     * @return the created {@link XFieldQuery} object
     */
    public XFieldQuery getFieldQuery(Query query) {
        // TODO: should we deprecate this?
        // because if there is no reader, then we cannot rewrite MTQ.
        final XFieldQuery created;
        try {
            created = new XFieldQuery(query, null, phraseHighlight, fieldMatch);
        } catch (IOException e) {
            // should never be thrown when reader is null
            throw new RuntimeException(e);
        }
        return created;
    }

    /**
     * Creates a {@link XFieldQuery} object.
     *
     * @param query  a query
     * @param reader the {@link IndexReader} handed to the {@link XFieldQuery}
     * @return the created {@link XFieldQuery} object
     * @throws IOException if creating the {@link XFieldQuery} fails
     */
    public XFieldQuery getFieldQuery(Query query, IndexReader reader) throws IOException {
        return new XFieldQuery(query, reader, phraseHighlight, fieldMatch);
    }

    /**
     * Returns the best fragment, using the builders this highlighter was constructed with.
     *
     * @param fieldQuery   {@link XFieldQuery} object
     * @param reader       {@link IndexReader} of the index
     * @param docId        document id to be highlighted
     * @param fieldName    field of the document to be highlighted
     * @param fragCharSize the length (number of chars) of a fragment
     * @return the best fragment (snippet) string
     * @throws IOException If there is a low-level I/O error
     */
    public final String getBestFragment(final XFieldQuery fieldQuery, IndexReader reader, int docId,
                                        String fieldName, int fragCharSize) throws IOException {
        return fragmentsBuilder.createFragment(reader, docId, fieldName,
                getFieldFragList(fragListBuilder, fieldQuery, reader, docId, fieldName, fragCharSize));
    }

    /**
     * Returns the best fragments, using the builders this highlighter was constructed with.
     *
     * @param fieldQuery      {@link XFieldQuery} object
     * @param reader          {@link IndexReader} of the index
     * @param docId           document id to be highlighted
     * @param fieldName       field of the document to be highlighted
     * @param fragCharSize    the length (number of chars) of a fragment
     * @param maxNumFragments maximum number of fragments
     * @return created fragments or null when no fragments created.
     *         size of the array can be less than maxNumFragments
     * @throws IOException If there is a low-level I/O error
     */
    public final String[] getBestFragments(final XFieldQuery fieldQuery, IndexReader reader, int docId,
                                           String fieldName, int fragCharSize, int maxNumFragments) throws IOException {
        return fragmentsBuilder.createFragments(reader, docId, fieldName,
                getFieldFragList(fragListBuilder, fieldQuery, reader, docId, fieldName, fragCharSize),
                maxNumFragments);
    }

    /**
     * Returns the best fragment, using the builders, tags and encoder passed by the caller.
     *
     * @param fieldQuery       {@link XFieldQuery} object
     * @param reader           {@link IndexReader} of the index
     * @param docId            document id to be highlighted
     * @param fieldName        field of the document to be highlighted
     * @param fragCharSize     the length (number of chars) of a fragment
     * @param fragListBuilder  {@link XFragListBuilder} object
     * @param fragmentsBuilder {@link XFragmentsBuilder} object
     * @param preTags          pre-tags to be used to highlight terms
     * @param postTags         post-tags to be used to highlight terms
     * @param encoder          an encoder that generates encoded text
     * @return the best fragment (snippet) string
     * @throws IOException If there is a low-level I/O error
     */
    public final String getBestFragment(final XFieldQuery fieldQuery, IndexReader reader, int docId,
                                        String fieldName, int fragCharSize,
                                        XFragListBuilder fragListBuilder, XFragmentsBuilder fragmentsBuilder,
                                        String[] preTags, String[] postTags, Encoder encoder) throws IOException {
        return fragmentsBuilder.createFragment(reader, docId, fieldName,
                getFieldFragList(fragListBuilder, fieldQuery, reader, docId, fieldName, fragCharSize),
                preTags, postTags, encoder);
    }

    /**
     * Returns the best fragments, using the builders, tags and encoder passed by the caller.
     *
     * @param fieldQuery       {@link XFieldQuery} object
     * @param reader           {@link IndexReader} of the index
     * @param docId            document id to be highlighted
     * @param fieldName        field of the document to be highlighted
     * @param fragCharSize     the length (number of chars) of a fragment
     * @param maxNumFragments  maximum number of fragments
     * @param fragListBuilder  {@link XFragListBuilder} object
     * @param fragmentsBuilder {@link XFragmentsBuilder} object
     * @param preTags          pre-tags to be used to highlight terms
     * @param postTags         post-tags to be used to highlight terms
     * @param encoder          an encoder that generates encoded text
     * @return created fragments or null when no fragments created.
     *         size of the array can be less than maxNumFragments
     * @throws IOException If there is a low-level I/O error
     */
    public final String[] getBestFragments(final XFieldQuery fieldQuery, IndexReader reader, int docId,
                                           String fieldName, int fragCharSize, int maxNumFragments,
                                           XFragListBuilder fragListBuilder, XFragmentsBuilder fragmentsBuilder,
                                           String[] preTags, String[] postTags, Encoder encoder) throws IOException {
        return fragmentsBuilder.createFragments(reader, docId, fieldName,
                getFieldFragList(fragListBuilder, fieldQuery, reader, docId, fieldName, fragCharSize),
                maxNumFragments, preTags, postTags, encoder);
    }

    /**
     * Builds the term stack and phrase list for the field (honouring the phrase limit) and
     * lets the given builder turn them into a frag list.
     */
    private XFieldFragList getFieldFragList(XFragListBuilder builder,
                                            final XFieldQuery fieldQuery, IndexReader reader, int docId,
                                            String fieldName, int fragCharSize) throws IOException {
        final XFieldPhraseList phrases = new XFieldPhraseList(
                new XFieldTermStack(reader, docId, fieldName, fieldQuery), fieldQuery, phraseLimit);
        return builder.createFieldFragList(phrases, fragCharSize);
    }

    /**
     * Returns whether phrase highlighting is enabled.
     *
     * @return whether phraseHighlight or not
     */
    public boolean isPhraseHighlight() {
        return phraseHighlight;
    }

    /**
     * Returns whether field matching is enabled.
     *
     * @return whether fieldMatch or not
     */
    public boolean isFieldMatch() {
        return fieldMatch;
    }

    /**
     * @return the maximum number of phrases to analyze when searching for the highest-scoring phrase.
     */
    public int getPhraseLimit() {
        return phraseLimit;
    }

    /**
     * Sets the maximum number of phrases to analyze when searching for the highest-scoring phrase.
     * The default is unlimited (Integer.MAX_VALUE).
     */
    public void setPhraseLimit(int phraseLimit) {
        this.phraseLimit = phraseLimit;
    }
}

View File

@ -0,0 +1,142 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo;
import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo.Toffs;
import java.util.ArrayList;
import java.util.List;
/**
 * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class
 * to create fragments (snippets).
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public abstract class XFieldFragList {

    private final List<WeightedFragInfo> fragInfos = new ArrayList<WeightedFragInfo>();

    /**
     * a constructor.
     *
     * @param fragCharSize the length (number of chars) of a fragment; not used by this base class
     */
    public XFieldFragList(int fragCharSize) {
    }

    /**
     * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos
     *
     * @param startOffset    start offset of the fragment
     * @param endOffset      end offset of the fragment
     * @param phraseInfoList list of WeightedPhraseInfo objects
     */
    public abstract void add(int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList);

    /**
     * return the list of WeightedFragInfos.
     *
     * @return fragInfos (the internal list itself, not a copy).
     */
    public List<WeightedFragInfo> getFragInfos() {
        return fragInfos;
    }

    /**
     * List of term offsets + weight for a frag info
     */
    public static class WeightedFragInfo {

        private final List<SubInfo> subInfos;
        private final float totalBoost;
        private final int startOffset;
        private final int endOffset;

        /**
         * @param startOffset start offset of the fragment
         * @param endOffset   end offset of the fragment
         * @param subInfos    the term offsets contained in the fragment; stored as is (not copied)
         * @param totalBoost  the boost value of the fragment
         */
        public WeightedFragInfo(int startOffset, int endOffset, List<SubInfo> subInfos, float totalBoost) {
            this.subInfos = subInfos;
            this.totalBoost = totalBoost;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

        public List<SubInfo> getSubInfos() {
            return subInfos;
        }

        public float getTotalBoost() {
            return totalBoost;
        }

        public int getStartOffset() {
            return startOffset;
        }

        public int getEndOffset() {
            return endOffset;
        }

        /**
         * Renders as <code>subInfos=(...)/boost(start,end)</code>.
         */
        @Override
        public String toString() {
            final StringBuilder out = new StringBuilder("subInfos=(");
            for (SubInfo info : subInfos) {
                out.append(info.toString());
            }
            out.append(")/").append(totalBoost);
            out.append('(').append(startOffset).append(',').append(endOffset).append(')');
            return out.toString();
        }

        /**
         * Represents the list of term offsets for some text
         */
        public static class SubInfo {

            // unnecessary member, just exists for debugging purpose
            private final String text;
            // usually termsOffsets.size() == 1,
            // but if position-gap > 1 and slop > 0 then size() could be greater than 1
            private final List<Toffs> termsOffsets;
            private final int seqnum;

            /**
             * @param text         the text the offsets belong to
             * @param termsOffsets the term offsets; stored as is (not copied)
             * @param seqnum       the sequence number of this sub info
             */
            public SubInfo(String text, List<Toffs> termsOffsets, int seqnum) {
                this.text = text;
                this.termsOffsets = termsOffsets;
                this.seqnum = seqnum;
            }

            public List<Toffs> getTermsOffsets() {
                return termsOffsets;
            }

            public int getSeqnum() {
                return seqnum;
            }

            public String getText() {
                return text;
            }

            /**
             * Renders as the text followed by all term offsets in parentheses.
             */
            @Override
            public String toString() {
                final StringBuilder out = new StringBuilder();
                out.append(text);
                out.append('(');
                for (Toffs offsets : termsOffsets) {
                    out.append(offsets.toString());
                }
                return out.append(')').toString();
            }
        }
    }
}

View File

@ -0,0 +1,261 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.XFieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.XFieldTermStack.TermInfo;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
* FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
* to create a FieldFragList object.
*/
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XFieldPhraseList {
LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<WeightedPhraseInfo>();
/**
 * Creates a phrase list without bounding the number of phrases: this simply delegates to
 * the three-argument constructor, passing {@link Integer#MAX_VALUE} as the phrase limit.
 *
 * @param fieldTermStack FieldTermStack object
 * @param fieldQuery FieldQuery object
 */
public XFieldPhraseList( XFieldTermStack fieldTermStack, XFieldQuery fieldQuery ){
  this( fieldTermStack, fieldQuery, Integer.MAX_VALUE );
}
/**
 * Gives access to the phrases collected by this object.
 *
 * @return the internal list of {@link WeightedPhraseInfo} itself, not a copy
 */
public List<WeightedPhraseInfo> getPhraseList() {
  return this.phraseList;
}
/**
 * Builds the phrase list for one field: the terms held by <code>fieldTermStack</code> are
 * matched against the phrase map that <code>fieldQuery</code> keeps for the field of the
 * term stack. Nothing is collected when the query has no map for that field.
 * <p>
 * The phrase limit is enforced once extraction is finished: phrases beyond the limit are
 * dropped from the end of the list. It therefore bounds the size of the result, not the
 * amount of work spent on extraction.
 *
 * @param fieldTermStack FieldTermStack object
 * @param fieldQuery FieldQuery object
 * @param phraseLimit maximum size of phraseList; a value &lt;= 0 yields an empty list
 */
public XFieldPhraseList( XFieldTermStack fieldTermStack, XFieldQuery fieldQuery, int phraseLimit ){
  final String field = fieldTermStack.getFieldName();

  QueryPhraseMap qpm = fieldQuery.getRootMap( field );
  if( qpm != null ){
    LinkedList<TermInfo> phraseCandidate = new LinkedList<TermInfo>();
    extractPhrases( fieldTermStack.termList, qpm, phraseCandidate, 0 );
    // extractPhrases() removes every candidate term it adds before returning
    assert phraseCandidate.size() == 0;
  }

  // Honour the documented contract of phraseLimit, which was previously ignored.
  while( !phraseList.isEmpty() && phraseList.size() > phraseLimit ){
    phraseList.removeLast();
  }
}
/**
 * Recursively consumes <code>terms</code> from its head while descending into the query
 * phrase map, exploring for every group of same-position terms both "match one of them"
 * and "skip the whole group". When <code>terms</code> is exhausted and <code>longest</code>
 * is positive, the first <code>longest</code> entries of <code>phraseCandidate</code> are
 * handed to {@link #addIfNoOverlap(WeightedPhraseInfo)}.
 * <p>
 * Both lists are restored before the method returns: every term added to
 * <code>phraseCandidate</code> is removed again and every popped term is pushed back onto
 * <code>terms</code>.
 *
 * @param terms remaining terms, consumed from the head of the list
 * @param currMap query map reached so far
 * @param phraseCandidate terms matched so far along the current path
 * @param longest size of <code>phraseCandidate</code> the last time it was a valid term or phrase, 0 if never
 */
void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
// no more terms: register the longest valid prefix seen on this path, if any
if (terms.isEmpty()) {
if (longest > 0) {
// NOTE(review): boost and number are read from currMap, i.e. the map reached at the end of
// the walk, which is not necessarily the map for which 'longest' was recorded - confirm intended.
addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
}
return;
}
// pop the head term plus all following terms that share its position
ArrayList<TermInfo> samePositionTerms = new ArrayList<TermInfo>();
do {
samePositionTerms.add(terms.pop());
} while (!terms.isEmpty() && terms.get(0).getPosition() == samePositionTerms.get(0).getPosition());
// try all next terms at the same position
for (TermInfo nextTerm : samePositionTerms) {
QueryPhraseMap nextMap = currMap.getTermMap(nextTerm.getText());
if (nextMap != null) {
phraseCandidate.add(nextTerm);
// remember the candidate size whenever the candidate forms a valid term or phrase
int l = longest;
if(nextMap.isValidTermOrPhrase( phraseCandidate ) ){
l = phraseCandidate.size();
}
extractPhrases(terms, nextMap, phraseCandidate, l);
// backtrack
phraseCandidate.removeLast();
}
}
// ignore the next term
extractPhrases(terms, currMap, phraseCandidate, longest);
// add terms back
// (push() inserts at the head, so the popped terms end up in reverse of the order they were popped in)
for (TermInfo nextTerm : samePositionTerms) {
terms.push(nextTerm);
}
}
/**
 * Appends <code>wpi</code> to the phrase list unless its offsets overlap a phrase that is
 * already in the list. On overlap, the first overlapping phrase receives the
 * {@link TermInfo}s of <code>wpi</code> and <code>wpi</code> itself is not added.
 *
 * @param wpi the phrase to register
 */
public void addIfNoOverlap( WeightedPhraseInfo wpi ){
  WeightedPhraseInfo overlapping = null;
  for( WeightedPhraseInfo known : getPhraseList() ){
    if( known.isOffsetOverlap( wpi ) ){
      overlapping = known;
      break;
    }
  }
  if( overlapping == null ){
    getPhraseList().add( wpi );
    return;
  }
  // WeightedPhraseInfo.addIfNoOverlap() dumps the second part of, for example, hyphenated words (social-economics).
  // Without merging, all information in its TermInfos would be lost and not available for further operations.
  overlapping.getTermsInfos().addAll( wpi.getTermsInfos() );
}
/**
 * Represents the list of term offsets and boost for some text
 */
public static class WeightedPhraseInfo {

  // concatenation of the term texts; unnecessary member, just exists for debugging purpose
  private String text;
  // usually termsOffsets.size() == 1,
  // but if position-gap > 1 and slop > 0 then size() could be greater than 1
  private List<Toffs> termsOffsets;
  // query boost
  private float boost;
  private int seqnum;
  private ArrayList<TermInfo> termsInfos;

  /**
   * Same as {@link #WeightedPhraseInfo(List, float, int)} with a sequence number of 0.
   */
  public WeightedPhraseInfo( List<TermInfo> terms, float boost ){
    this( terms, boost, 0 );
  }

  /**
   * Builds the info from a non-empty list of terms. Successive terms whose positions
   * differ by exactly one share a single {@link Toffs} whose end offset is extended;
   * any other position difference opens a new {@link Toffs}.
   *
   * @param terms the terms of the phrase; copied, must hold at least one element
   * @param boost the query boost
   * @param seqnum the sequence number to report from {@link #getSeqnum()}
   */
  public WeightedPhraseInfo( List<TermInfo> terms, float boost, int seqnum ){
    this.boost = boost;
    this.seqnum = seqnum;

    // We keep TermInfos for further operations
    termsInfos = new ArrayList<TermInfo>( terms );
    termsOffsets = new ArrayList<Toffs>( terms.size() );

    final TermInfo first = terms.get( 0 );
    // always the last element of termsOffsets
    Toffs current = new Toffs( first.getStartOffset(), first.getEndOffset() );
    termsOffsets.add( current );
    if( terms.size() == 1 ){
      text = first.getText();
      return;
    }

    final StringBuilder joined = new StringBuilder();
    joined.append( first.getText() );
    int previousPosition = first.getPosition();
    for( int i = 1; i < terms.size(); i++ ){
      final TermInfo term = terms.get( i );
      joined.append( term.getText() );
      if( term.getPosition() - previousPosition == 1 ){
        // adjacent term: stretch the current offsets
        current.setEndOffset( term.getEndOffset() );
      }
      else{
        current = new Toffs( term.getStartOffset(), term.getEndOffset() );
        termsOffsets.add( current );
      }
      previousPosition = term.getPosition();
    }
    text = joined.toString();
  }

  /**
   * @return the text
   */
  public String getText() {
    return this.text;
  }

  /**
   * @return the termsOffsets
   */
  public List<Toffs> getTermsOffsets() {
    return this.termsOffsets;
  }

  /**
   * @return the boost
   */
  public float getBoost() {
    return this.boost;
  }

  /**
   * @return the termInfos
   */
  public List<TermInfo> getTermsInfos() {
    return this.termsInfos;
  }

  /**
   * @return the seqnum
   */
  public int getSeqnum() {
    return this.seqnum;
  }

  /**
   * @return the start offset of the first entry of the offsets list
   */
  public int getStartOffset(){
    final Toffs head = termsOffsets.get( 0 );
    return head.startOffset;
  }

  /**
   * @return the end offset of the last entry of the offsets list
   */
  public int getEndOffset(){
    final Toffs tail = termsOffsets.get( termsOffsets.size() - 1 );
    return tail.endOffset;
  }

  /**
   * Tells whether the span from {@link #getStartOffset()} to {@link #getEndOffset()} of
   * this phrase overlaps the same span of <code>other</code>.
   */
  public boolean isOffsetOverlap( WeightedPhraseInfo other ){
    final int so = getStartOffset();
    final int eo = getEndOffset();
    final int oso = other.getStartOffset();
    final int oeo = other.getEndOffset();
    return ( so <= oso && oso < eo )
        || ( so < oeo && oeo <= eo )
        || ( oso <= so && so < oeo )
        || ( oso < eo && eo <= oeo );
  }

  /**
   * Renders the text, the boost and all offsets.
   */
  @Override
  public String toString(){
    final StringBuilder out = new StringBuilder();
    out.append( text );
    out.append( '(' ).append( boost ).append( ")(" );
    for( Toffs offsets : termsOffsets ){
      out.append( offsets );
    }
    return out.append( ')' ).toString();
  }

  /**
   * Term offsets (start + end)
   */
  public static class Toffs {

    private int startOffset;
    private int endOffset;

    public Toffs( int startOffset, int endOffset ){
      this.startOffset = startOffset;
      this.endOffset = endOffset;
    }

    public int getStartOffset(){
      return this.startOffset;
    }

    public int getEndOffset(){
      return this.endOffset;
    }

    public void setEndOffset( int endOffset ){
      this.endOffset = endOffset;
    }

    /**
     * Renders the offsets as <code>(start,end)</code>.
     */
    @Override
    public String toString(){
      return new StringBuilder()
          .append( '(' ).append( startOffset )
          .append( ',' ).append( endOffset )
          .append( ')' ).toString();
    }
  }
}
}

View File

@ -0,0 +1,520 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.vectorhighlight.XFieldTermStack.TermInfo;
import org.apache.lucene.util.SorterTemplate;
import java.io.IOException;
import java.util.*;
/**
* FieldQuery breaks down query object into terms/phrases and keeps
* them in a QueryPhraseMap structure.
*/
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XFieldQuery {
final boolean fieldMatch;
// fieldMatch==true, Map<fieldName,QueryPhraseMap>
// fieldMatch==false, Map<null,QueryPhraseMap>
Map<String, QueryPhraseMap> rootMaps = new HashMap<String, QueryPhraseMap>();
// fieldMatch==true, Map<fieldName,setOfTermsInQueries>
// fieldMatch==false, Map<null,setOfTermsInQueries>
Map<String, Set<String>> termSetMap = new HashMap<String, Set<String>>();
int termOrPhraseNumber; // used for colored tag support
// The maximum number of different matching terms accumulated from any one MultiTermQuery
private static final int MAX_MTQ_TERMS = 1024;
/**
 * Flattens <code>query</code>, records its terms, expands overlapping phrase queries and
 * registers every resulting query in the root map it belongs to.
 *
 * @param query the query to analyze
 * @param reader passed on to flattening, term saving and map building; may be null
 *               (see the reader-less constructor)
 * @param phraseHighlight when false, each term of a multi-term phrase query is also
 *                        registered as a term of its own
 * @param fieldMatch whether maps are kept per field name (true) or under the single null key (false)
 * @throws IOException propagated from flattening the query or saving its terms
 */
XFieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
  this.fieldMatch = fieldMatch;

  final Set<Query> flattened = new LinkedHashSet<Query>();
  flatten( query, reader, flattened );
  saveTerms( flattened, reader );

  for( Query expanded : expand( flattened ) ){
    final QueryPhraseMap rootMap = getRootMap( expanded );
    rootMap.add( expanded, reader );
    if( phraseHighlight || !( expanded instanceof PhraseQuery ) ){
      continue;
    }
    // phrase highlighting is off: register the terms of the phrase individually as well
    final Term[] phraseTerms = ( (PhraseQuery)expanded ).getTerms();
    if( phraseTerms.length > 1 ){
      for( Term phraseTerm : phraseTerms ){
        rootMap.addTerm( phraseTerm, expanded.getBoost() );
      }
    }
  }
}
/**
 * Backwards-compatible constructor that does not take an IndexReader: it delegates to the
 * four-argument constructor with a null reader. The reader is only required to support
 * MultiTermQuery.
 */
XFieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
  this( query, null, phraseHighlight, fieldMatch );
}
/**
 * Recursively breaks <code>sourceQuery</code> down and collects the resulting TermQuery
 * and multi-term PhraseQuery objects into <code>flatQueries</code>.
 * <ul>
 * <li>BooleanQuery: every non-prohibited clause is flattened</li>
 * <li>DisjunctionMaxQuery: every disjunct is flattened</li>
 * <li>TermQuery: added if not already present</li>
 * <li>PhraseQuery: added as is when it has several terms, as a TermQuery when it has one</li>
 * <li>ConstantScoreQuery / FilteredQuery: the wrapped query, if any, is flattened</li>
 * <li>anything else: rewritten against <code>reader</code> (when there is one) and
 *     flattened again if the rewrite produced a different query; otherwise discarded</li>
 * </ul>
 *
 * @param sourceQuery the query to decompose
 * @param reader used to rewrite queries that are not handled directly; may be null
 * @param flatQueries receives the flat queries
 * @throws IOException propagated from rewriting a query
 */
void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries ) throws IOException{
  if( sourceQuery instanceof BooleanQuery ){
    for( BooleanClause clause : ( (BooleanQuery)sourceQuery ).getClauses() ){
      if( clause.isProhibited() ){
        continue;
      }
      flatten( clause.getQuery(), reader, flatQueries );
    }
    return;
  }

  if( sourceQuery instanceof DisjunctionMaxQuery ){
    for( Query disjunct : (DisjunctionMaxQuery)sourceQuery ){
      flatten( disjunct, reader, flatQueries );
    }
    return;
  }

  if( sourceQuery instanceof TermQuery ){
    if( !flatQueries.contains( sourceQuery ) ){
      flatQueries.add( sourceQuery );
    }
    return;
  }

  if( sourceQuery instanceof PhraseQuery ){
    if( flatQueries.contains( sourceQuery ) ){
      return;
    }
    final PhraseQuery phrase = (PhraseQuery)sourceQuery;
    final Term[] phraseTerms = phrase.getTerms();
    if( phraseTerms.length > 1 ){
      flatQueries.add( phrase );
    }
    else if( phraseTerms.length == 1 ){
      // a one-term phrase is nothing but a term
      flatQueries.add( new TermQuery( phraseTerms[0] ) );
    }
    return;
  }

  if( sourceQuery instanceof ConstantScoreQuery ){
    final Query wrapped = ( (ConstantScoreQuery)sourceQuery ).getQuery();
    if( wrapped != null ){
      flatten( wrapped, reader, flatQueries );
    }
    return;
  }

  if( sourceQuery instanceof FilteredQuery ){
    final Query wrapped = ( (FilteredQuery)sourceQuery ).getQuery();
    if( wrapped != null ){
      flatten( wrapped, reader, flatQueries );
    }
    return;
  }

  if( reader == null ){
    // nothing to rewrite against: discard the query
    return;
  }

  Query toRewrite = sourceQuery;
  if( sourceQuery instanceof MultiTermQuery ){
    // work on a copy so that the rewrite method of the caller's query is left alone
    final MultiTermQuery copy = (MultiTermQuery)sourceQuery.clone();
    copy.setRewriteMethod( new MultiTermQuery.TopTermsScoringBooleanQueryRewrite( MAX_MTQ_TERMS ) );
    toRewrite = copy;
  }
  final Query rewritten = toRewrite.rewrite( reader );
  if( rewritten != toRewrite ){
    // only rewrite once and then flatten again - the rewritten query could have a special treatment
    // if this method is overwritten in a subclass.
    flatten( rewritten, reader, flatQueries );
  }
  // if the query is already rewritten we discard it
}
/*
 * Builds expandQueries out of flatQueries:
 *
 *   expandQueries := flatQueries + phrase queries obtained by merging overlapping phrases
 *
 * ex1) flatQueries={a,b,c}
 *      => expandQueries={a,b,c}
 * ex2) flatQueries={a,"b c","c d"}
 *      => expandQueries={a,"b c","c d","b c d"}
 *
 * Every query is removed from flatQueries while it is processed, so flatQueries is
 * empty once this method returns.
 */
Collection<Query> expand( Collection<Query> flatQueries ){
  final Set<Query> expanded = new LinkedHashSet<Query>();
  final Iterator<Query> remaining = flatQueries.iterator();
  while( remaining.hasNext() ){
    final Query current = remaining.next();
    remaining.remove();
    expanded.add( current );
    if( current instanceof PhraseQuery ){
      // compare against the phrase queries that have not been processed yet
      for( Query other : flatQueries ){
        if( other instanceof PhraseQuery ){
          checkOverlap( expanded, (PhraseQuery)current, (PhraseQuery)other );
        }
      }
    }
  }
  return expanded;
}
/*
 * Looks for an overlap between PhraseQuery A and PhraseQuery B, in both directions.
 * Nothing is done when the slops differ, or when fieldMatch is set and the first terms
 * of A and B belong to different fields.
 *
 * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"}
 * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"}
 * ex3) A="a b", B="c d" => no overlap; expandQueries={}
 */
private void checkOverlap( Collection<Query> expandQueries, PhraseQuery a, PhraseQuery b ){
  if( a.getSlop() != b.getSlop() ){
    return;
  }
  final Term[] aTerms = a.getTerms();
  final Term[] bTerms = b.getTerms();
  final boolean sameField = aTerms[0].field().equals( bTerms[0].field() );
  if( fieldMatch && !sameField ){
    return;
  }
  // A followed by B, then B followed by A
  checkOverlap( expandQueries, aTerms, bTerms, a.getSlop(), a.getBoost() );
  checkOverlap( expandQueries, bTerms, aTerms, b.getSlop(), b.getBoost() );
}
/*
 * Checks whether a tail of src is also the head of dest and, if so, builds the
 * PhraseQuery "src + rest of dest" and adds it to expandQueries (unless already there).
 *
 * ex1) src="a b", dest="c d"   => no overlap
 * ex2) src="a b", dest="a b c" => no overlap
 * ex3) src="a b", dest="b c"   => overlap; expandQueries={"a b c"}
 * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"}
 * ex5) src="a b c", dest="b c" => no overlap
 * ex6) src="a b c", dest="b"   => no overlap
 * ex7) src="a a a a", dest="a a a" => overlap;
 *      expandQueries={"a a a a a","a a a a a a"}
 * ex8) src="a b c d", dest="b c" => no overlap
 */
private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){
  // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
  // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
  // converts PhraseQuery to TermQuery)
  for( int start = 1; start < src.length; start++ ){
    // number of src terms from 'start' up to the end of src
    final int tailLength = src.length - start;

    boolean overlap = true;
    for( int offset = 0; overlap && offset < tailLength; offset++ ){
      if( offset < dest.length && !src[start + offset].text().equals( dest[offset].text() ) ){
        overlap = false;
      }
    }
    if( !overlap || tailLength >= dest.length ){
      continue;
    }

    // all of src, followed by the part of dest that sticks out of the overlap
    final PhraseQuery merged = new PhraseQuery();
    for( Term srcTerm : src ){
      merged.add( srcTerm );
    }
    for( int k = tailLength; k < dest.length; k++ ){
      merged.add( new Term( src[0].field(), dest[k].text() ) );
    }
    merged.setSlop( slop );
    merged.setBoost( boost );
    if( !expandQueries.contains( merged ) ){
      expandQueries.add( merged );
    }
  }
}
/*
 * Returns the root map registered under the key of the given query, creating and
 * registering an empty one first when there is none yet.
 */
QueryPhraseMap getRootMap( Query query ){
  final String key = getKey( query );
  QueryPhraseMap rootMap = rootMaps.get( key );
  if( rootMap != null ){
    return rootMap;
  }
  rootMap = new QueryPhraseMap( this );
  rootMaps.put( key, rootMap );
  return rootMap;
}
/*
 * Computes the map key of a query: null when fieldMatch is off, otherwise the field of
 * the query (for a PhraseQuery, the field of its first term).
 * Throws a RuntimeException for any query type other than TermQuery, PhraseQuery and
 * MultiTermQuery when fieldMatch is on.
 */
private String getKey( Query query ){
  if( !fieldMatch ){
    return null;
  }
  if( query instanceof TermQuery ){
    return ( (TermQuery)query ).getTerm().field();
  }
  if( query instanceof PhraseQuery ){
    final Term[] phraseTerms = ( (PhraseQuery)query ).getTerms();
    return phraseTerms[0].field();
  }
  if( query instanceof MultiTermQuery ){
    return ( (MultiTermQuery)query ).getField();
  }
  throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
}
/*
 * Records the text of every term of the flat queries in termSetMap.
 *
 * ex1) q=name:john
 *      - fieldMatch==true
 *          termSetMap=Map<"name",Set<"john">>
 *      - fieldMatch==false
 *          termSetMap=Map<null,Set<"john">>
 *
 * ex2) q=name:john title:manager
 *      - fieldMatch==true
 *          termSetMap=Map<"name",Set<"john">,
 *                         "title",Set<"manager">>
 *      - fieldMatch==false
 *          termSetMap=Map<null,Set<"john","manager">>
 *
 * ex3) q=name:"john lennon"
 *      - fieldMatch==true
 *          termSetMap=Map<"name",Set<"john","lennon">>
 *      - fieldMatch==false
 *          termSetMap=Map<null,Set<"john","lennon">>
 *
 * A MultiTermQuery is rewritten against the reader and the terms of the resulting
 * boolean clauses are recorded; this requires a non-null reader.
 */
void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{
  for( Query flat : flatQueries ){
    final Set<String> fieldTerms = getTermSet( flat );

    if( flat instanceof TermQuery ){
      fieldTerms.add( ( (TermQuery)flat ).getTerm().text() );
      continue;
    }
    if( flat instanceof PhraseQuery ){
      for( Term phraseTerm : ( (PhraseQuery)flat ).getTerms() ){
        fieldTerms.add( phraseTerm.text() );
      }
      continue;
    }
    if( !( flat instanceof MultiTermQuery ) || reader == null ){
      throw new RuntimeException( "query \"" + flat.toString() + "\" must be flatten first." );
    }
    final BooleanQuery mtqTerms = (BooleanQuery)flat.rewrite( reader );
    for( BooleanClause clause : mtqTerms.getClauses() ){
      final TermQuery clauseQuery = (TermQuery)clause.getQuery();
      fieldTerms.add( clauseQuery.getTerm().text() );
    }
  }
}
/*
 * Returns the term set registered under the key of the given query, creating and
 * registering an empty one first when there is none yet.
 */
private Set<String> getTermSet( Query query ){
  final String key = getKey( query );
  Set<String> terms = termSetMap.get( key );
  if( terms != null ){
    return terms;
  }
  terms = new HashSet<String>();
  termSetMap.put( key, terms );
  return terms;
}
/*
 * Looks up the term set of a field; the null key is used when fieldMatch is off.
 * Returns null when nothing is registered under the key.
 */
Set<String> getTermSet( String field ){
  final String key = fieldMatch ? field : null;
  return termSetMap.get( key );
}
/**
 * Looks up the map registered for <code>term</code> directly below the root map of
 * <code>fieldName</code>.
 *
 * @return QueryPhraseMap, or null when the field has no root map or the root map has no
 *         entry for the term
 */
public QueryPhraseMap getFieldTermMap( String fieldName, String term ){
  final QueryPhraseMap fieldRoot = getRootMap( fieldName );
  if( fieldRoot == null ){
    return null;
  }
  return fieldRoot.subMap.get( term );
}
/**
 * Searches the root map of <code>fieldName</code> for the given phrase candidate.
 *
 * @return QueryPhraseMap, or null when the field has no root map or the root map does
 *         not hold the candidate
 */
public QueryPhraseMap searchPhrase( String fieldName, final List<TermInfo> phraseCandidate ){
  final QueryPhraseMap fieldRoot = getRootMap( fieldName );
  return fieldRoot == null ? null : fieldRoot.searchPhrase( phraseCandidate );
}
/**
 * Looks up the root map of a field; the null key is used when fieldMatch is off.
 *
 * @return the root map, or null when nothing is registered under the key
 */
public QueryPhraseMap getRootMap( String fieldName ){
  final String key = fieldMatch ? fieldName : null;
  return rootMaps.get( key );
}
/*
 * Hands out the current value of the termOrPhraseNumber counter and increments the counter.
 */
int nextTermOrPhraseNumber(){
  final int assigned = termOrPhraseNumber;
  termOrPhraseNumber = assigned + 1;
  return assigned;
}
/**
 * Internal structure of a query for highlighting: represents
 * a nested query structure.
 * <p>
 * Every node maps a term text to the node reached by consuming that term. A node flagged
 * as terminal marks the end of a registered term or phrase.
 */
public static class QueryPhraseMap {

  boolean terminal;
  int slop;   // valid if terminal == true and phraseHighlight == true
  float boost;  // valid if terminal == true
  int[] positions; // valid if terminal == true
  int termOrPhraseNumber;   // valid if terminal == true
  XFieldQuery fieldQuery;
  Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>();

  public QueryPhraseMap( XFieldQuery fieldQuery ){
    this.fieldQuery = fieldQuery;
  }

  /**
   * Registers a single term directly below this map and marks its node as terminal.
   */
  void addTerm( Term term, float boost ){
    QueryPhraseMap map = getOrNewMap( subMap, term.text() );
    map.markTerminal( boost );
  }

  /**
   * Returns the node registered for <code>term</code> in <code>subMap</code>, creating
   * and registering it first when it is missing.
   */
  private QueryPhraseMap getOrNewMap( Map<String, QueryPhraseMap> subMap, String term ){
    QueryPhraseMap map = subMap.get( term );
    if( map == null ){
      map = new QueryPhraseMap( fieldQuery );
      subMap.put( term, map );
    }
    return map;
  }

  /**
   * Registers a flat query below this map. Only TermQuery and PhraseQuery are accepted;
   * the terms of a phrase are sorted by position before being registered.
   *
   * @param query a TermQuery or PhraseQuery
   * @param reader not used by this method
   * @throws RuntimeException if the query is of any other type
   */
  void add( Query query, IndexReader reader ) {
    if( query instanceof TermQuery ){
      addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
    }
    else if( query instanceof PhraseQuery ){
      PhraseQuery pq = (PhraseQuery)query;
      final Term[] terms = pq.getTerms();
      final int[] positions = pq.getPositions();
      // sort terms and positions together, by increasing position
      new SorterTemplate() {

        @Override
        protected void swap(int i, int j) {
          Term tmpTerm = terms[i];
          terms[i] = terms[j];
          terms[j] = tmpTerm;

          int tmpPos = positions[i];
          positions[i] = positions[j];
          positions[j] = tmpPos;
        }

        @Override
        protected int compare(int i, int j) {
          return positions[i] - positions[j];
        }

        @Override
        protected void setPivot(int i) {
          throw new UnsupportedOperationException();
        }

        @Override
        protected int comparePivot(int j) {
          throw new UnsupportedOperationException();
        }

      }.mergeSort(0, terms.length - 1);

      addToMap(pq, terms, positions, 0, subMap, pq.getSlop());
    }
    else
      throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
  }

  /**
   * Counts the entries of <code>positions</code> from index <code>i</code> on that hold
   * the same position as entry <code>i</code> (entry <code>i</code> included).
   */
  private int numTermsAtSamePosition(int[] positions, int i) {
    int numTermsAtSamePosition = 1;
    for (int j = i + 1; j < positions.length; ++j) {
      if (positions[j] == positions[i]) {
        ++numTermsAtSamePosition;
      }
    }
    return numTermsAtSamePosition;
  }

  /**
   * Registers the terms of <code>pq</code> from index <code>i</code> on below
   * <code>map</code>: every term sharing the position of term <code>i</code> gets a node,
   * which is either marked terminal (no terms left) or filled recursively with the
   * following terms. When <code>slop</code> is greater than 2 and terms are left, the
   * same is done for a variant in which the current group of terms and the next group
   * have traded places, with <code>slop - 2</code>.
   */
  private void addToMap(PhraseQuery pq, Term[] terms, int[] positions, int i, Map<String, QueryPhraseMap> map, int slop) {
    int numTermsAtSamePosition = numTermsAtSamePosition(positions, i);
    for (int j = 0; j < numTermsAtSamePosition; ++j) {
      QueryPhraseMap qpm = getOrNewMap(map, terms[i + j].text());
      if (i + numTermsAtSamePosition == terms.length) {
        qpm.markTerminal(pq.getSlop(), pq.getBoost(), uniquePositions(positions));
      } else {
        addToMap(pq, terms, positions, i + numTermsAtSamePosition, qpm.subMap, slop);
      }
    }
    if (slop > 2 && i + numTermsAtSamePosition < terms.length) {
      Term[] otherTerms = Arrays.copyOf(terms, terms.length);
      int[] otherPositions = Arrays.copyOf(positions, positions.length);
      final int nextTermAtSamePosition = numTermsAtSamePosition(positions, i + numTermsAtSamePosition);
      // move the next group to index i ...
      System.arraycopy(terms, i + numTermsAtSamePosition, otherTerms, i, nextTermAtSamePosition);
      System.arraycopy(positions, i + numTermsAtSamePosition, otherPositions, i, nextTermAtSamePosition);
      // ... and the current group right after it
      System.arraycopy(terms, i, otherTerms, i + nextTermAtSamePosition, numTermsAtSamePosition);
      System.arraycopy(positions, i, otherPositions, i + nextTermAtSamePosition, numTermsAtSamePosition);
      addToMap(pq, otherTerms, otherPositions, i, map, slop - 2);
    }
  }

  /**
   * Collapses runs of equal consecutive values. The input array itself is returned when
   * it holds no such run.
   */
  private int[] uniquePositions(int[] positions) {
    int uniqueCount = 1;
    for (int i = 1; i < positions.length; ++i) {
      if (positions[i] != positions[i - 1]) {
        ++uniqueCount;
      }
    }
    if (uniqueCount == positions.length) {
      return positions;
    }
    int[] result = new int[uniqueCount];
    result[0] = positions[0];
    for (int i = 1, j = 1; i < positions.length; ++i) {
      if (positions[i] != positions[i - 1]) {
        result[j++] = positions[i];
      }
    }
    return result;
  }

  /**
   * @return the node registered for <code>term</code> directly below this map, or null
   */
  public QueryPhraseMap getTermMap( String term ){
    return subMap.get( term );
  }

  private void markTerminal( float boost ){
    markTerminal( 0, boost, null );
  }

  /**
   * Marks this node as terminal. The first registration is always recorded; a later one
   * only replaces it when it has a greater slop, or the same slop and a greater boost.
   */
  private void markTerminal( int slop, float boost, int[] positions ){
    // The first registration must always win: comparing against the default field values
    // (slop == 0, boost == 0.0f) alone would never mark terms and exact phrases whose boost
    // is zero or negative, so they would never be highlighted.
    if (!terminal || slop > this.slop || (slop == this.slop && boost > this.boost)) {
      this.terminal = true;
      this.slop = slop;
      this.boost = boost;
      this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
      this.positions = positions;
    }
  }

  public boolean isTerminal(){
    return terminal;
  }

  public int getSlop(){
    return slop;
  }

  public float getBoost(){
    return boost;
  }

  public int getTermOrPhraseNumber(){
    return termOrPhraseNumber;
  }

  /**
   * Follows the terms of <code>phraseCandidate</code> down from this map.
   *
   * @return the node reached when every term has a node and the candidate is a valid
   *         term or phrase for it, null otherwise
   */
  public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){
    QueryPhraseMap currMap = this;
    for( TermInfo ti : phraseCandidate ){
      currMap = currMap.subMap.get( ti.getText() );
      if( currMap == null ) return null;
    }
    return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null;
  }

  /**
   * Tells whether <code>phraseCandidate</code> is acceptable for this node: the node must
   * be terminal, and for a candidate of several terms the accumulated difference between
   * the expected and the actual position gaps must not exceed the slop.
   */
  public boolean isValidTermOrPhrase( final List<TermInfo> phraseCandidate ){
    // check terminal
    if( !terminal ) return false;

    // if the candidate is a term, it is valid
    if( phraseCandidate.size() == 1 ) return true;

    assert phraseCandidate.size() == positions.length;
    // else check whether the candidate is valid phrase
    // compare position-gaps between terms to slop
    int pos = phraseCandidate.get( 0 ).getPosition();
    int totalDistance = 0;
    for( int i = 1; i < phraseCandidate.size(); i++ ){
      int nextPos = phraseCandidate.get( i ).getPosition();
      final int expectedDelta = positions[i] - positions[i - 1];
      final int actualDelta = nextPos - pos;
      totalDistance += Math.abs(expectedDelta - actualDelta);
      pos = nextPos;
    }
    return totalDistance <= slop;
  }
}
}

View File

@ -0,0 +1,209 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
/**
* <code>FieldTermStack</code> is a stack that keeps query terms in the specified field
* of the document to be highlighted.
*/
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XFieldTermStack {
private final String fieldName;
LinkedList<TermInfo> termList = new LinkedList<TermInfo>();
//public static void main( String[] args ) throws Exception {
// Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
// QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer );
// Query query = parser.parse( "a x:b" );
// FieldQuery fieldQuery = new FieldQuery( query, true, false );
// Directory dir = new RAMDirectory();
// IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer));
// Document doc = new Document();
// FieldType ft = new FieldType(TextField.TYPE_STORED);
// ft.setStoreTermVectors(true);
// ft.setStoreTermVectorOffsets(true);
// ft.setStoreTermVectorPositions(true);
// doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) );
// doc.add( new Field( "f", ft, "b a b a f" ) );
// writer.addDocument( doc );
// writer.close();
// IndexReader reader = IndexReader.open(dir1);
// new FieldTermStack( reader, 0, "f", fieldQuery );
// reader.close();
//}
/**
 * Reads the term vector of one field of one document and keeps, sorted by position,
 * every occurrence of a term that belongs to the term set of the query. The stack is
 * left empty when the query has no term set for the field, when the document or the
 * field has no term vector, or when positions or offsets are not available.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public XFieldTermStack( IndexReader reader, int docId, String fieldName, final XFieldQuery fieldQuery ) throws IOException {
  this.fieldName = fieldName;

  final Set<String> queryTerms = fieldQuery.getTermSet( fieldName );
  if( queryTerms == null ){
    // un-matched fieldName specified when fieldMatch == true: null snippet
    return;
  }

  final Fields vectors = reader.getTermVectors( docId );
  if( vectors == null ){
    // null snippet
    return;
  }
  final Terms vector = vectors.terms( fieldName );
  if( vector == null ){
    // null snippet
    return;
  }

  final CharsRef spare = new CharsRef();
  final TermsEnum termsEnum = vector.iterator( null );
  DocsAndPositionsEnum dpEnum = null;
  final int numDocs = reader.maxDoc();

  final List<TermInfo> matches = new ArrayList<TermInfo>();
  for( BytesRef text = termsEnum.next(); text != null; text = termsEnum.next() ){
    UnicodeUtil.UTF8toUTF16( text, spare );
    final String term = spare.toString();
    if( !queryTerms.contains( term ) ){
      continue;
    }
    dpEnum = termsEnum.docsAndPositions( null, dpEnum );
    if( dpEnum == null ){
      // null snippet
      return;
    }
    dpEnum.nextDoc();

    // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
    final int docFreqPlusOne = reader.docFreq( new Term( fieldName, text ) ) + 1;
    final float weight = (float)( Math.log( numDocs / (double)docFreqPlusOne ) + 1.0 );

    final int freq = dpEnum.freq();
    for( int occurrence = 0; occurrence < freq; occurrence++ ){
      final int position = dpEnum.nextPosition();
      final int startOffset = dpEnum.startOffset();
      if( startOffset < 0 ){
        // no offsets, null snippet
        return;
      }
      matches.add( new TermInfo( term, startOffset, dpEnum.endOffset(), position, weight ) );
    }
  }

  // sort by position
  CollectionUtil.timSort( matches );
  this.termList.addAll( matches );
}
/**
 * @return the field name this stack was built for
 */
public String getFieldName(){
  return this.fieldName;
}
/**
 * Removes the top TermInfo object from the stack.
 *
 * @return the removed TermInfo object, or null when the stack is empty
 */
public TermInfo pop(){
  final TermInfo top = termList.poll();
  return top;
}
/**
 * Looks at the top of the stack and leaves it in place.
 *
 * @return the top TermInfo object, or null when the stack is empty
 */
public TermInfo peek() {
  final TermInfo top = termList.peek();
  return top;
}
/**
 * Puts a TermInfo object back on the top of the stack, i.e. at the head of the term list.
 *
 * @param termInfo the TermInfo object to be put on the top of the stack
 */
public void push( TermInfo termInfo ){
  this.termList.push( termInfo );
}
/**
 * Tells whether there is nothing left on the stack.
 *
 * @return true if the stack is empty, false if not
 */
public boolean isEmpty(){
  if( termList == null ){
    return true;
  }
  return termList.isEmpty();
}
/**
 * Single term with its position/offsets in the document and IDF weight.
 * Instances are ordered by position.
 */
public static class TermInfo implements Comparable<TermInfo>{

  private final String text;
  private final int startOffset;
  private final int endOffset;
  private final int position;

  // IDF-weight of this term
  private final float weight;

  public TermInfo( String text, int startOffset, int endOffset, int position, float weight ){
    this.text = text;
    this.position = position;
    this.startOffset = startOffset;
    this.endOffset = endOffset;
    this.weight = weight;
  }

  public String getText(){
    return this.text;
  }

  public int getStartOffset(){
    return this.startOffset;
  }

  public int getEndOffset(){
    return this.endOffset;
  }

  public int getPosition(){
    return this.position;
  }

  public float getWeight(){
    return this.weight;
  }

  /**
   * Renders the term as <code>text(startOffset,endOffset,position)</code>.
   */
  @Override
  public String toString(){
    final StringBuilder out = new StringBuilder();
    out.append( text );
    out.append( '(' ).append( startOffset );
    out.append( ',' ).append( endOffset );
    out.append( ',' ).append( position );
    out.append( ')' );
    return out.toString();
  }

  /**
   * Compares by position only.
   *
   * @return the difference between this position and the position of <code>o</code>
   */
  @Override
  public int compareTo( TermInfo o ){
    final int delta = this.position - o.position;
    return delta;
  }
}
}

View File

@ -0,0 +1,35 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * FragListBuilder is an interface for FieldFragList builder classes.
 * A FragListBuilder class can be plugged in to Highlighter.
 * <p>
 * An implementation turns the phrases held by an {@link XFieldPhraseList} into an
 * {@link XFieldFragList}.
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public interface XFragListBuilder {
/**
 * Creates a FieldFragList.
 *
 * @param fieldPhraseList FieldPhraseList object
 * @param fragCharSize the length (number of chars) of a fragment
 * @return the created FieldFragList object
 */
public XFieldFragList createFieldFragList( XFieldPhraseList fieldPhraseList, int fragCharSize );
}

View File

@ -0,0 +1,96 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.Encoder;
import java.io.IOException;
/**
 * {@link org.apache.lucene.search.vectorhighlight.XFragmentsBuilder} is an interface for fragments (snippets) builder classes.
 * A {@link org.apache.lucene.search.vectorhighlight.XFragmentsBuilder} class can be plugged in to
 * {@link org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter}.
 * <p>
 * Each operation comes in two flavours: one that takes no tags or encoder, and one that
 * takes the pre-tags, post-tags and {@link Encoder} to use.
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public interface XFragmentsBuilder {
/**
 * Creates a single fragment.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldFragList FieldFragList object
 * @return a created fragment or null when no fragment created
 * @throws IOException If there is a low-level I/O error
 */
public String createFragment( IndexReader reader, int docId, String fieldName,
XFieldFragList fieldFragList ) throws IOException;
/**
 * Creates multiple fragments.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldFragList FieldFragList object
 * @param maxNumFragments maximum number of fragments
 * @return created fragments or null when no fragments created.
 * size of the array can be less than maxNumFragments
 * @throws IOException If there is a low-level I/O error
 */
public String[] createFragments( IndexReader reader, int docId, String fieldName,
XFieldFragList fieldFragList, int maxNumFragments ) throws IOException;
/**
 * Creates a single fragment, using the given tags and encoder.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldFragList FieldFragList object
 * @param preTags pre-tags to be used to highlight terms
 * @param postTags post-tags to be used to highlight terms
 * @param encoder an encoder that generates encoded text
 * @return a created fragment or null when no fragment created
 * @throws IOException If there is a low-level I/O error
 */
public String createFragment( IndexReader reader, int docId, String fieldName,
XFieldFragList fieldFragList, String[] preTags, String[] postTags,
Encoder encoder ) throws IOException;
/**
 * Creates multiple fragments, using the given tags and encoder.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldFragList FieldFragList object
 * @param maxNumFragments maximum number of fragments
 * @param preTags pre-tags to be used to highlight terms
 * @param postTags post-tags to be used to highlight terms
 * @param encoder an encoder that generates encoded text
 * @return created fragments or null when no fragments created.
 * size of the array can be less than maxNumFragments
 * @throws IOException If there is a low-level I/O error
 */
public String[] createFragments( IndexReader reader, int docId, String fieldName,
XFieldFragList fieldFragList, int maxNumFragments, String[] preTags, String[] postTags,
Encoder encoder ) throws IOException;
}

View File

@ -0,0 +1,84 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo;
import org.apache.lucene.util.CollectionUtil;
import java.util.Comparator;
import java.util.List;
/**
 * A fragments builder that hands back its fragment infos ordered by score
 * (see {@link ScoreComparator}).
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XScoreOrderFragmentsBuilder extends XBaseFragmentsBuilder {

  /**
   * Creates a builder using the superclass defaults.
   */
  public XScoreOrderFragmentsBuilder() {
    super();
  }

  /**
   * Creates a builder with explicit markup tags.
   *
   * @param preTags  array of pre-tags for markup terms.
   * @param postTags array of post-tags for markup terms.
   */
  public XScoreOrderFragmentsBuilder( String[] preTags, String[] postTags ) {
    super( preTags, postTags );
  }

  /**
   * Creates a builder with an explicit boundary scanner.
   *
   * @param bs the boundary scanner passed on to the superclass
   */
  public XScoreOrderFragmentsBuilder( BoundaryScanner bs ) {
    super( bs );
  }

  /**
   * Creates a builder with explicit markup tags and boundary scanner.
   *
   * @param preTags  array of pre-tags for markup terms.
   * @param postTags array of post-tags for markup terms.
   * @param bs       the boundary scanner passed on to the superclass
   */
  public XScoreOrderFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner bs ) {
    super( preTags, postTags, bs );
  }

  /**
   * Sorts the given list with {@link ScoreComparator} and returns it.
   * The sort happens in place: the returned list is the very list passed in.
   */
  @Override
  public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) {
    final Comparator<WeightedFragInfo> byScore = new ScoreComparator();
    CollectionUtil.timSort( src, byScore );
    return src;
  }

  /**
   * Orders {@link WeightedFragInfo}s by total boost, highest first; entries whose
   * boosts are neither greater nor smaller than each other are ordered by
   * ascending start offset.
   */
  public static class ScoreComparator implements Comparator<WeightedFragInfo> {

    @Override
    public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) {
      // Higher boost sorts first. Plain relational tests are kept on purpose so that
      // the ordering of special float values stays exactly as before.
      if ( o1.getTotalBoost() > o2.getTotalBoost() ) {
        return -1;
      }
      if ( o1.getTotalBoost() < o2.getTotalBoost() ) {
        return 1;
      }
      // Same score: the fragment that starts earlier wins.
      if ( o1.getStartOffset() < o2.getStartOffset() ) {
        return -1;
      }
      return o1.getStartOffset() > o2.getStartOffset() ? 1 : 0;
    }
  }
}

View File

@ -0,0 +1,55 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo;
import java.util.ArrayList;
import java.util.List;
/**
 * A simple implementation of {@link XFieldFragList}: the boost of a fragment is
 * the plain sum of the boosts of the phrases it contains.
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XSimpleFieldFragList extends XFieldFragList {

  /**
   * Creates an empty fragment list.
   *
   * @param fragCharSize the length (number of chars) of a fragment
   */
  public XSimpleFieldFragList( int fragCharSize ) {
    super( fragCharSize );
  }

  /**
   * Appends one fragment covering {@code [startOffset, endOffset]} to this list.
   * Each phrase contributes one {@code SubInfo} (text, term offsets, sequence number)
   * and its boost is added to the fragment's total boost.
   *
   * @param startOffset    start offset of the fragment
   * @param endOffset      end offset of the fragment
   * @param phraseInfoList the phrases that fall into the fragment
   */
  @Override
  public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
    final List<SubInfo> fragmentSubInfos = new ArrayList<SubInfo>();
    float boostSum = 0;
    for ( final WeightedPhraseInfo phrase : phraseInfoList ) {
      final SubInfo subInfo = new SubInfo( phrase.getText(), phrase.getTermsOffsets(), phrase.getSeqnum() );
      fragmentSubInfos.add( subInfo );
      boostSum += phrase.getBoost();
    }
    final WeightedFragInfo fragment = new WeightedFragInfo( startOffset, endOffset, fragmentSubInfos, boostSum );
    getFragInfos().add( fragment );
  }
}

View File

@ -0,0 +1,43 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A simple implementation of {@link XFragListBuilder} that collects its fragments
 * in an {@link XSimpleFieldFragList}.
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XSimpleFragListBuilder extends XBaseFragListBuilder {

  /**
   * Creates a builder using the superclass defaults.
   */
  public XSimpleFragListBuilder() {
    super();
  }

  /**
   * Creates a builder with an explicit margin.
   *
   * @param margin the margin value passed on to the superclass
   */
  public XSimpleFragListBuilder( int margin ) {
    super( margin );
  }

  /**
   * Builds the fragment list by handing a fresh {@link XSimpleFieldFragList} to the
   * three-argument {@code createFieldFragList} overload.
   *
   * @param fieldPhraseList the phrases to distribute over fragments
   * @param fragCharSize    the length (number of chars) of a fragment
   * @return the populated fragment list
   */
  @Override
  public XFieldFragList createFieldFragList( XFieldPhraseList fieldPhraseList, int fragCharSize ) {
    final XSimpleFieldFragList target = new XSimpleFieldFragList( fragCharSize );
    return createFieldFragList( fieldPhraseList, target, fragCharSize );
  }
}

View File

@ -0,0 +1,63 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo;
import java.util.List;
/**
 * A simple implementation of a fragments builder: fragment infos are used in the
 * order in which they are supplied.
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XSimpleFragmentsBuilder extends XBaseFragmentsBuilder {

  /**
   * Creates a builder using the superclass defaults.
   */
  public XSimpleFragmentsBuilder() {
    super();
  }

  /**
   * Creates a builder with explicit markup tags.
   *
   * @param preTags  array of pre-tags for markup terms.
   * @param postTags array of post-tags for markup terms.
   */
  public XSimpleFragmentsBuilder( String[] preTags, String[] postTags ) {
    super( preTags, postTags );
  }

  /**
   * Creates a builder with an explicit boundary scanner.
   *
   * @param bs the boundary scanner passed on to the superclass
   */
  public XSimpleFragmentsBuilder( BoundaryScanner bs ) {
    super( bs );
  }

  /**
   * Creates a builder with explicit markup tags and boundary scanner.
   *
   * @param preTags  array of pre-tags for markup terms.
   * @param postTags array of post-tags for markup terms.
   * @param bs       the boundary scanner passed on to the superclass
   */
  public XSimpleFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner bs ) {
    super( preTags, postTags, bs );
  }

  /**
   * Returns the source list itself, untouched: no reordering and no copy.
   */
  @Override
  public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) {
    return src;
  }
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.XFieldPhraseList.WeightedPhraseInfo;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * An implementation of {@link XFragListBuilder} that generates at most one
 * {@link WeightedFragInfo} object, spanning offsets {@code 0} to
 * {@code Integer.MAX_VALUE}.
 * Typical use case of this class is to get the entire field contents by combining
 * it with {@link XSimpleFragmentsBuilder}.<br/>
 * Usage example as given for the upstream Lucene classes (with this copy the
 * X-prefixed counterparts are presumably the ones to use -- TODO confirm):
 * <pre class="prettyprint">
 * FastVectorHighlighter h = new FastVectorHighlighter( true, true,
 *   new SingleFragListBuilder(), new SimpleFragmentsBuilder() );
 * </pre>
 */
//LUCENE MONITOR - REMOVE ME WHEN LUCENE 4.5 IS OUT
public class XSingleFragListBuilder implements XFragListBuilder {

  /**
   * Gathers the phrases of {@code fieldPhraseList} into a single fragment.
   * Collection stops at the end of the phrase list or at the first {@code null}
   * entry; when nothing was collected the returned list stays empty.
   *
   * @param fieldPhraseList the phrases to put into the fragment
   * @param fragCharSize    fragment size handed to the created {@link XSimpleFieldFragList}
   * @return a fragment list holding zero or one fragment
   */
  @Override
  public XFieldFragList createFieldFragList( XFieldPhraseList fieldPhraseList, int fragCharSize ) {
    final XFieldFragList fragList = new XSimpleFieldFragList( fragCharSize );
    final List<WeightedPhraseInfo> collected = new ArrayList<WeightedPhraseInfo>();

    final Iterator<WeightedPhraseInfo> phrases = fieldPhraseList.phraseList.iterator();
    while ( phrases.hasNext() ) {
      final WeightedPhraseInfo current = phrases.next();
      if ( current == null ) {
        // a null entry ends the collection, later entries are ignored
        break;
      }
      collected.add( current );
    }

    if ( !collected.isEmpty() ) {
      fragList.add( 0, Integer.MAX_VALUE, collected );
    }
    return fragList;
  }
}

View File

@ -4,7 +4,7 @@ import gnu.trove.set.hash.TCharHashSet;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner; import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
/** /**
* A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner}. * A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.XSimpleBoundaryScanner}.
* <p/> * <p/>
* Uses specialized char set to lookup boundary, and fixes a problem with start offset in the * Uses specialized char set to lookup boundary, and fixes a problem with start offset in the
* beginning of the text: https://issues.apache.org/jira/browse/LUCENE-3697 (which has a problem * beginning of the text: https://issues.apache.org/jira/browse/LUCENE-3697 (which has a problem

View File

@ -18,20 +18,11 @@
*/ */
package org.elasticsearch.search.highlight; package org.elasticsearch.search.highlight;
import java.util.Map; import com.google.common.collect.Maps;
import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder; import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder; import org.apache.lucene.search.vectorhighlight.*;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.CustomFieldQuery;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder;
import org.apache.lucene.search.vectorhighlight.SingleFragListBuilder;
import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2; import org.elasticsearch.common.lucene.search.vectorhighlight.SimpleBoundaryScanner2;
@ -45,7 +36,7 @@ import org.elasticsearch.search.highlight.vectorhighlight.SourceScoreOrderFragme
import org.elasticsearch.search.highlight.vectorhighlight.SourceSimpleFragmentsBuilder; import org.elasticsearch.search.highlight.vectorhighlight.SourceSimpleFragmentsBuilder;
import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.search.internal.SearchContext;
import com.google.common.collect.Maps; import java.util.Map;
/** /**
* *
@ -85,10 +76,10 @@ public class FastVectorHighlighter implements Highlighter {
try { try {
MapperHighlightEntry entry = cache.mappers.get(mapper); MapperHighlightEntry entry = cache.mappers.get(mapper);
FieldQuery fieldQuery = null; XFieldQuery fieldQuery = null;
if (entry == null) { if (entry == null) {
FragListBuilder fragListBuilder; XFragListBuilder fragListBuilder;
BaseFragmentsBuilder fragmentsBuilder; XBaseFragmentsBuilder fragmentsBuilder;
BoundaryScanner boundaryScanner = SimpleBoundaryScanner2.DEFAULT; BoundaryScanner boundaryScanner = SimpleBoundaryScanner2.DEFAULT;
if (field.boundaryMaxScan() != SimpleBoundaryScanner2.DEFAULT_MAX_SCAN || field.boundaryChars() != SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS) { if (field.boundaryMaxScan() != SimpleBoundaryScanner2.DEFAULT_MAX_SCAN || field.boundaryChars() != SimpleBoundaryScanner2.DEFAULT_BOUNDARY_CHARS) {
@ -96,7 +87,7 @@ public class FastVectorHighlighter implements Highlighter {
} }
if (field.numberOfFragments() == 0) { if (field.numberOfFragments() == 0) {
fragListBuilder = new SingleFragListBuilder(); fragListBuilder = new XSingleFragListBuilder();
if (mapper.fieldType().stored()) { if (mapper.fieldType().stored()) {
fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.preTags(), field.postTags(), boundaryScanner); fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.preTags(), field.postTags(), boundaryScanner);
@ -104,10 +95,10 @@ public class FastVectorHighlighter implements Highlighter {
fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner); fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
} }
} else { } else {
fragListBuilder = field.fragmentOffset() == -1 ? new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fragmentOffset()); fragListBuilder = field.fragmentOffset() == -1 ? new XSimpleFragListBuilder() : new XSimpleFragListBuilder(field.fragmentOffset());
if (field.scoreOrdered()) { if (field.scoreOrdered()) {
if (mapper.fieldType().stored()) { if (mapper.fieldType().stored()) {
fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner); fragmentsBuilder = new XScoreOrderFragmentsBuilder(field.preTags(), field.postTags(), boundaryScanner);
} else { } else {
fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner); fragmentsBuilder = new SourceScoreOrderFragmentsBuilder(mapper, context, field.preTags(), field.postTags(), boundaryScanner);
} }
@ -127,7 +118,7 @@ public class FastVectorHighlighter implements Highlighter {
// parameters to FVH are not requires since: // parameters to FVH are not requires since:
// first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch) // first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch)
// fragment builders are used explicitly // fragment builders are used explicitly
cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter(); cache.fvh = new org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter();
} }
CustomFieldQuery.highlightFilters.set(field.highlightFilter()); CustomFieldQuery.highlightFilters.set(field.highlightFilter());
if (field.requireFieldMatch()) { if (field.requireFieldMatch()) {
@ -166,16 +157,16 @@ public class FastVectorHighlighter implements Highlighter {
} }
private class MapperHighlightEntry { private class MapperHighlightEntry {
public FragListBuilder fragListBuilder; public XFragListBuilder fragListBuilder;
public FragmentsBuilder fragmentsBuilder; public XFragmentsBuilder fragmentsBuilder;
public org.apache.lucene.search.highlight.Highlighter highlighter; public org.apache.lucene.search.highlight.Highlighter highlighter;
} }
private class HighlighterEntry { private class HighlighterEntry {
public org.apache.lucene.search.vectorhighlight.FastVectorHighlighter fvh; public org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter fvh;
public FieldQuery noFieldMatchFieldQuery; public XFieldQuery noFieldMatchFieldQuery;
public FieldQuery fieldMatchFieldQuery; public XFieldQuery fieldMatchFieldQuery;
public Map<FieldMapper, MapperHighlightEntry> mappers = Maps.newHashMap(); public Map<FieldMapper, MapperHighlightEntry> mappers = Maps.newHashMap();
} }

View File

@ -22,10 +22,10 @@ package org.elasticsearch.search.highlight.vectorhighlight;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; import org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; import org.apache.lucene.search.vectorhighlight.XFragmentsBuilder;
import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import org.elasticsearch.index.analysis.*; import org.elasticsearch.index.analysis.*;
@ -35,7 +35,7 @@ import java.util.Comparator;
import java.util.List; import java.util.List;
/** /**
* Simple helper class for {@link FastVectorHighlighter} {@link FragmentsBuilder} implemenations. * Simple helper class for {@link XFastVectorHighlighter} {@link XFragmentsBuilder} implemenations.
*/ */
public final class FragmentBuilderHelper { public final class FragmentBuilderHelper {
@ -45,7 +45,7 @@ public final class FragmentBuilderHelper {
/** /**
* Fixes problems with broken analysis chains if positions and offsets are messed up that can lead to * Fixes problems with broken analysis chains if positions and offsets are messed up that can lead to
* {@link StringIndexOutOfBoundsException} in the {@link FastVectorHighlighter} * {@link StringIndexOutOfBoundsException} in the {@link XFastVectorHighlighter}
*/ */
public static WeightedFragInfo fixWeightedFragInfo(FieldMapper<?> mapper, Field[] values, WeightedFragInfo fragInfo) { public static WeightedFragInfo fixWeightedFragInfo(FieldMapper<?> mapper, Field[] values, WeightedFragInfo fragInfo) {
assert fragInfo != null : "FragInfo must not be null"; assert fragInfo != null : "FragInfo must not be null";

View File

@ -21,14 +21,14 @@ package org.elasticsearch.search.highlight.vectorhighlight;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner; import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo;
import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.FieldMapper;
/** /**
* Direct Subclass of Lucene's org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder * Direct Subclass of Lucene's org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder
* that corrects offsets for broken analysis chains. * that corrects offsets for broken analysis chains.
*/ */
public class SimpleFragmentsBuilder extends org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder { public class SimpleFragmentsBuilder extends org.apache.lucene.search.vectorhighlight.XSimpleFragmentsBuilder {
protected final FieldMapper<?> mapper; protected final FieldMapper<?> mapper;
public SimpleFragmentsBuilder(FieldMapper<?> mapper, public SimpleFragmentsBuilder(FieldMapper<?> mapper,

View File

@ -19,32 +19,25 @@
package org.elasticsearch.search.highlight.vectorhighlight; package org.elasticsearch.search.highlight.vectorhighlight;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ngram.NGramTokenizerFactory;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField; import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner; import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; import org.apache.lucene.search.vectorhighlight.XFieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; import org.apache.lucene.search.vectorhighlight.XScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.lookup.SearchLookup; import org.elasticsearch.search.lookup.SearchLookup;
import java.io.IOException;
import java.util.List;
/** /**
* *
*/ */
public class SourceScoreOrderFragmentsBuilder extends ScoreOrderFragmentsBuilder { public class SourceScoreOrderFragmentsBuilder extends XScoreOrderFragmentsBuilder {
private final FieldMapper<?> mapper; private final FieldMapper<?> mapper;

View File

@ -28,11 +28,9 @@ import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder; import org.elasticsearch.common.settings.ImmutableSettings.Builder;
import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.FilterBuilders; import org.elasticsearch.index.query.*;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder.Operator; import org.elasticsearch.index.query.MatchQueryBuilder.Operator;
import org.elasticsearch.index.query.MatchQueryBuilder.Type; import org.elasticsearch.index.query.MatchQueryBuilder.Type;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.rest.RestStatus; import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.builder.SearchSourceBuilder;
@ -1081,70 +1079,6 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
} }
/**
 * Indexes five identical documents with term vectors (positions and offsets) and runs
 * the same phrase query three times: once without choosing a highlighter type, once
 * with the plain highlighter chosen on the request, and once with the plain
 * highlighter chosen on the highlighted field.
 */
@Test
public void testDisableFastVectorHighlighter() throws Exception {
    final String title = "This is a test for the workaround for the fast vector highlighting SOLR-3724";
    final String phrase = "test for the workaround";
    final String plainHighlighted = "This is a <em>test</em> for the <em>workaround</em> for the fast vector highlighting SOLR-3724";

    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
            .startObject("title").field("type", "string").field("store", "yes").field("term_vector", "with_positions_offsets").endObject()
            .endObject().endObject().endObject();
    client().admin().indices().prepareCreate("test")
            .setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2))
            .addMapping("type1", mapping)
            .execute().actionGet();
    ensureGreen();

    for (int docId = 0; docId < 5; docId++) {
        client().prepareIndex("test", "type1", Integer.toString(docId))
                .setSource("title", title)
                .execute().actionGet();
    }
    refresh();

    // 1) No highlighter type given.
    SearchResponse response = client().prepareSearch()
            .setQuery(matchPhraseQuery("title", phrase))
            .addHighlightedField("title", 50, 1, 10)
            .execute().actionGet();
    assertThat(Arrays.toString(response.getShardFailures()), response.getFailedShards(), equalTo(0));
    assertThat(response.getHits().totalHits(), equalTo(5L));
    assertThat(response.getHits().hits().length, equalTo(5));
    for (SearchHit hit : response.getHits()) {
        // Because of SOLR-3724 nothing is highlighted when FVH is used
        assertThat(hit.highlightFields().isEmpty(), equalTo(true));
    }

    // 2) Plain highlighter instead of FVH, chosen for the whole request.
    response = client().prepareSearch()
            .setQuery(matchPhraseQuery("title", phrase))
            .addHighlightedField("title", 50, 1, 10)
            .setHighlighterType("highlighter")
            .execute().actionGet();
    assertThat(Arrays.toString(response.getShardFailures()), response.getFailedShards(), equalTo(0));
    assertThat(response.getHits().totalHits(), equalTo(5L));
    assertThat(response.getHits().hits().length, equalTo(5));
    for (SearchHit hit : response.getHits()) {
        // With plain highlighter terms are highlighted correctly
        assertThat(hit.highlightFields().get("title").fragments()[0].string(), equalTo(plainHighlighted));
    }

    // 3) Plain highlighter instead of FVH, chosen on the field level.
    // NOTE(review): the request-level type is set to "highlighter" here as well, so the
    // field-level setting is not exercised on its own -- confirm whether that is intended.
    response = client().prepareSearch()
            .setQuery(matchPhraseQuery("title", phrase))
            .addHighlightedField(new HighlightBuilder.Field("title").highlighterType("highlighter"))
            .setHighlighterType("highlighter")
            .execute().actionGet();
    assertThat(Arrays.toString(response.getShardFailures()), response.getFailedShards(), equalTo(0));
    assertThat(response.getHits().totalHits(), equalTo(5L));
    assertThat(response.getHits().hits().length, equalTo(5));
    for (SearchHit hit : response.getHits()) {
        // With plain highlighter terms are highlighted correctly
        assertThat(hit.highlightFields().get("title").fragments()[0].string(), equalTo(plainHighlighted));
    }
}
@Test @Test
public void testFSHHighlightAllMvFragments() throws Exception { public void testFSHHighlightAllMvFragments() throws Exception {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder() client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder()
@ -1534,4 +1468,54 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
assertThat(response.getFailedShards(), equalTo(0)); assertThat(response.getFailedShards(), equalTo(0));
} }
/**
 * Highlights a phrase query on a field analyzed by a custom "code" analyzer
 * (pattern tokenizer, then word_delimiter filter with preserve_original enabled,
 * then lowercase) and stored with term vectors (positions and offsets).
 * Expects the matched phrase to come back wrapped in one pair of em tags.
 */
@Test
public void testHighlightComplexPhraseQuery() throws Exception {
    final String text = "def log_worker_status( worker )\n pass";
    final String phrase = "def log_worker_status( worker )";
    final String expectedFragment = "<em>def log_worker_status( worker</em> )\n pass";

    XContentBuilder mapping = jsonBuilder()
            .startObject()
                .startObject("type")
                    .startObject("properties")
                        .startObject("text")
                            .field("type", "string")
                            .field("analyzer", "code")
                            .field("term_vector", "with_positions_offsets")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject();

    prepareCreate("test")
            .setSettings(ImmutableSettings.builder()
                    // analyzer: pattern tokenizer -> word_delimiter -> lowercase
                    .put("analysis.analyzer.code.type", "custom")
                    .put("analysis.analyzer.code.tokenizer", "code")
                    .put("analysis.analyzer.code.filter", "code,lowercase")
                    .put("analysis.tokenizer.code.type", "pattern")
                    .put("analysis.tokenizer.code.pattern", "[.,:;/\"<>(){}\\[\\]\\s]")
                    .put("analysis.filter.code.type", "word_delimiter")
                    .put("analysis.filter.code.generate_word_parts", "true")
                    .put("analysis.filter.code.generate_number_parts", "true")
                    .put("analysis.filter.code.catenate_words", "false")
                    .put("analysis.filter.code.catenate_numbers", "false")
                    .put("analysis.filter.code.catenate_all", "false")
                    .put("analysis.filter.code.split_on_case_change", "true")
                    .put("analysis.filter.code.preserve_original", "true")
                    .put("analysis.filter.code.split_on_numerics", "true")
                    .put("analysis.filter.code.stem_english_possessive", "false")
                    .build())
            .addMapping("type", mapping)
            .execute().actionGet();
    ensureGreen();

    client().prepareIndex("test", "type", "1")
            .setSource(jsonBuilder().startObject()
                    .field("text", text)
                    .endObject())
            .setRefresh(true)
            .execute().actionGet();

    SearchResponse response = client().prepareSearch("test")
            .setQuery(QueryBuilders.matchPhraseQuery("text", phrase))
            .addHighlightedField("text")
            .execute().actionGet();

    assertThat(response.getFailedShards(), equalTo(0));
    assertThat(response.getHits().totalHits(), equalTo(1L));
    String firstFragment = response.getHits().getAt(0).getHighlightFields().get("text").fragments()[0].string();
    assertThat(firstFragment, equalTo(expectedFragment));
}
} }

View File

@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.search.vectorhighlight.CustomFieldQuery; import org.apache.lucene.search.vectorhighlight.CustomFieldQuery;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; import org.apache.lucene.search.vectorhighlight.XFastVectorHighlighter;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.Lucene;
@ -60,7 +60,7 @@ public class VectorHighlighterTests {
assertThat(topDocs.totalHits, equalTo(1)); assertThat(topDocs.totalHits, equalTo(1));
FastVectorHighlighter highlighter = new FastVectorHighlighter(); XFastVectorHighlighter highlighter = new XFastVectorHighlighter();
String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))), String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))),
reader, topDocs.scoreDocs[0].doc, "content", 30); reader, topDocs.scoreDocs[0].doc, "content", 30);
assertThat(fragment, notNullValue()); assertThat(fragment, notNullValue());
@ -83,7 +83,7 @@ public class VectorHighlighterTests {
assertThat(topDocs.totalHits, equalTo(1)); assertThat(topDocs.totalHits, equalTo(1));
FastVectorHighlighter highlighter = new FastVectorHighlighter(); XFastVectorHighlighter highlighter = new XFastVectorHighlighter();
PrefixQuery prefixQuery = new PrefixQuery(new Term("content", "ba")); PrefixQuery prefixQuery = new PrefixQuery(new Term("content", "ba"));
assertThat(prefixQuery.getRewriteMethod().getClass().getName(), equalTo(PrefixQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT.getClass().getName())); assertThat(prefixQuery.getRewriteMethod().getClass().getName(), equalTo(PrefixQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT.getClass().getName()));
@ -125,7 +125,7 @@ public class VectorHighlighterTests {
assertThat(topDocs.totalHits, equalTo(1)); assertThat(topDocs.totalHits, equalTo(1));
FastVectorHighlighter highlighter = new FastVectorHighlighter(); XFastVectorHighlighter highlighter = new XFastVectorHighlighter();
String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))), String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))),
reader, topDocs.scoreDocs[0].doc, "content", 30); reader, topDocs.scoreDocs[0].doc, "content", 30);
assertThat(fragment, nullValue()); assertThat(fragment, nullValue());
@ -147,7 +147,7 @@ public class VectorHighlighterTests {
assertThat(topDocs.totalHits, equalTo(1)); assertThat(topDocs.totalHits, equalTo(1));
FastVectorHighlighter highlighter = new FastVectorHighlighter(); XFastVectorHighlighter highlighter = new XFastVectorHighlighter();
String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))), String fragment = highlighter.getBestFragment(highlighter.getFieldQuery(new TermQuery(new Term("content", "bad"))),
reader, topDocs.scoreDocs[0].doc, "content", 30); reader, topDocs.scoreDocs[0].doc, "content", 30);
assertThat(fragment, nullValue()); assertThat(fragment, nullValue());