LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to

respect field boundaries in the case of multivalued fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1362223 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Martijn van Groningen 2012-07-16 19:24:49 +00:00
parent 963efbfea2
commit 858dbbdba4
4 changed files with 291 additions and 16 deletions

View File

@ -18,6 +18,9 @@ New features
create automata from a fixed collection of UTF-8 encoded BytesRef
(Dawid Weiss, Robert Muir)
* LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to
respect field boundaries in the case of multivalued fields. (Martijn van Groningen)
API Changes
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.

View File

@ -17,10 +17,6 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
@ -29,10 +25,19 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
protected String[] preTags, postTags;
@ -48,6 +53,7 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
public static final String[] COLORED_POST_TAGS = { "</b>" };
private char multiValuedSeparator = ' ';
private final BoundaryScanner boundaryScanner;
private boolean discreteMultiValueHighlighting = false;
protected BaseFragmentsBuilder(){
this( new String[]{ "<b>" }, new String[]{ "</b>" } );
@ -76,7 +82,7 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
public abstract List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src );
private static final Encoder NULL_ENCODER = new DefaultEncoder();
public String createFragment( IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList ) throws IOException {
return createFragment( reader, docId, fieldName, fieldFragList,
@ -102,14 +108,23 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
public String[] createFragments( IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList, int maxNumFragments,
String[] preTags, String[] postTags, Encoder encoder ) throws IOException {
if( maxNumFragments < 0 )
throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
List<WeightedFragInfo> fragInfos = getWeightedFragInfoList( fieldFragList.getFragInfos() );
if( maxNumFragments < 0 ) {
throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
}
List<WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
List<String> fragments = new ArrayList<String>( maxNumFragments );
Field[] values = getFields( reader, docId, fieldName );
if( values.length == 0 ) return null;
if( values.length == 0 ) {
return null;
}
if (discreteMultiValueHighlighting && values.length > 1) {
fragInfos = discreteMultiValueHighlighting(fragInfos, values);
}
fragInfos = getWeightedFragInfoList(fragInfos);
StringBuilder buffer = new StringBuilder();
int[] nextValueIndex = { 0 };
for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){
@ -186,7 +201,92 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
return buffer.substring( startOffset, eo );
}
protected List<WeightedFragInfo> discreteMultiValueHighlighting(List<WeightedFragInfo> fragInfos, Field[] fields) {
Map<String, List<WeightedFragInfo>> fieldNameToFragInfos = new HashMap<String, List<WeightedFragInfo>>();
for (Field field : fields) {
fieldNameToFragInfos.put(field.name(), new ArrayList<WeightedFragInfo>());
}
fragInfos: for (WeightedFragInfo fragInfo : fragInfos) {
int fieldStart;
int fieldEnd = 0;
for (Field field : fields) {
if (field.stringValue().isEmpty()) {
fieldEnd++;
continue;
}
fieldStart = fieldEnd;
fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name.
if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart &&
fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) {
fieldNameToFragInfos.get(field.name()).add(fragInfo);
continue fragInfos;
}
if (fragInfo.getSubInfos().isEmpty()) {
continue fragInfos;
}
Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0);
if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) {
continue;
}
int fragStart = fieldStart;
if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) {
fragStart = fragInfo.getStartOffset();
}
int fragEnd = fieldEnd;
if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) {
fragEnd = fragInfo.getEndOffset();
}
List<SubInfo> subInfos = new ArrayList<SubInfo>();
WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, fragInfo.getTotalBoost());
Iterator<SubInfo> subInfoIterator = fragInfo.getSubInfos().iterator();
while (subInfoIterator.hasNext()) {
SubInfo subInfo = subInfoIterator.next();
List<Toffs> toffsList = new ArrayList<Toffs>();
Iterator<Toffs> toffsIterator = subInfo.getTermsOffsets().iterator();
while (toffsIterator.hasNext()) {
Toffs toffs = toffsIterator.next();
if (toffs.getStartOffset() >= fieldStart && toffs.getEndOffset() <= fieldEnd) {
toffsList.add(toffs);
toffsIterator.remove();
}
}
if (!toffsList.isEmpty()) {
subInfos.add(new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum()));
}
if (subInfo.getTermsOffsets().isEmpty()) {
subInfoIterator.remove();
}
}
fieldNameToFragInfos.get(field.name()).add(weightedFragInfo);
}
}
List<WeightedFragInfo> result = new ArrayList<WeightedFragInfo>();
for (List<WeightedFragInfo> weightedFragInfos : fieldNameToFragInfos.values()) {
result.addAll(weightedFragInfos);
}
Collections.sort(result, new Comparator<WeightedFragInfo>() {
public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) {
return info1.getStartOffset() - info2.getStartOffset();
}
});
return result;
}
public void setMultiValuedSeparator( char separator ){
multiValuedSeparator = separator;
}
@ -195,6 +295,14 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
return multiValuedSeparator;
}
public boolean isDiscreteMultiValueHighlighting() {
return discreteMultiValueHighlighting;
}
public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
}
protected String getPreTag( int num ){
return getPreTag( preTags, num );
}

View File

@ -17,12 +17,12 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
import java.util.ArrayList;
import java.util.List;
/**
* FieldFragList has a list of "frag info" that is used by FragmentsBuilder class
* to create fragments (snippets).
@ -116,7 +116,11 @@ public abstract class FieldFragList {
public int getSeqnum(){
return seqnum;
}
public String getText(){
return text;
}
@Override
public String toString(){
StringBuilder sb = new StringBuilder();

View File

@ -17,20 +17,32 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util._TestUtil;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class SimpleFragmentsBuilderTest extends AbstractTestCase {
@ -175,4 +187,152 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase {
sfb.setMultiValuedSeparator( '/' );
assertEquals( "//a b c//<b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
}
public void testDiscreteMultiValueHighlighting() throws Exception {
makeIndexShortMV();
FieldQuery fq = new FieldQuery( tq( "d" ), true, true );
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
FieldPhraseList fpl = new FieldPhraseList( stack, fq );
SimpleFragListBuilder sflb = new SimpleFragListBuilder();
FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
sfb.setDiscreteMultiValueHighlighting(true);
assertEquals( "<b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
make1dmfIndex("some text to highlight", "highlight other text");
fq = new FieldQuery( tq( "text" ), true, true );
stack = new FieldTermStack( reader, 0, F, fq );
fpl = new FieldPhraseList( stack, fq );
sflb = new SimpleFragListBuilder();
ffl = sflb.createFieldFragList( fpl, 32 );
String[] result = sfb.createFragments(reader, 0, F, ffl, 3);
assertEquals(2, result.length);
assertEquals("some <b>text</b> to highlight", result[0]);
assertEquals("other <b>text</b>", result[1]);
fq = new FieldQuery( tq( "highlight" ), true, true );
stack = new FieldTermStack( reader, 0, F, fq );
fpl = new FieldPhraseList( stack, fq );
sflb = new SimpleFragListBuilder();
ffl = sflb.createFieldFragList( fpl, 32 );
result = sfb.createFragments(reader, 0, F, ffl, 3);
assertEquals(2, result.length);
assertEquals("text to <b>highlight</b>", result[0]);
assertEquals("<b>highlight</b> other text", result[1]);
}
public void testRandomDiscreteMultiValueHighlighting() throws Exception {
String[] randomValues = new String[3 + random().nextInt(10 * RANDOM_MULTIPLIER)];
for (int i = 0; i < randomValues.length; i++) {
String randomValue;
do {
randomValue = _TestUtil.randomSimpleString(random());
} while ("".equals(randomValue));
randomValues[i] = randomValue;
}
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(
random(),
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
FieldType customType = new FieldType(TextField.TYPE_STORED);
customType.setStoreTermVectors(true);
customType.setStoreTermVectorOffsets(true);
customType.setStoreTermVectorPositions(true);
int numDocs = randomValues.length * 5;
int numFields = 2 + random().nextInt(5);
int numTerms = 2 + random().nextInt(3);
List<Doc> docs = new ArrayList<Doc>(numDocs);
List<Document> documents = new ArrayList<Document>(numDocs);
Map<String, Set<Integer>> valueToDocId = new HashMap<String, Set<Integer>>();
for (int i = 0; i < numDocs; i++) {
Document document = new Document();
String[][] fields = new String[numFields][numTerms];
for (int j = 0; j < numFields; j++) {
String[] fieldValues = new String[numTerms];
fieldValues[0] = getRandomValue(randomValues, valueToDocId, i);
StringBuilder builder = new StringBuilder(fieldValues[0]);
for (int k = 1; k < numTerms; k++) {
fieldValues[k] = getRandomValue(randomValues, valueToDocId, i);
builder.append(' ').append(fieldValues[k]);
}
document.add(new Field(F, builder.toString(), customType));
fields[j] = fieldValues;
}
docs.add(new Doc(fields));
documents.add(document);
}
writer.addDocuments(documents);
writer.close();
IndexReader reader = DirectoryReader.open(dir);
try {
int highlightIters = 1 + random().nextInt(120 * RANDOM_MULTIPLIER);
for (int highlightIter = 0; highlightIter < highlightIters; highlightIter++) {
String queryTerm = randomValues[random().nextInt(randomValues.length)];
int randomHit = valueToDocId.get(queryTerm).iterator().next();
List<StringBuilder> builders = new ArrayList<StringBuilder>();
for (String[] fieldValues : docs.get(randomHit).fieldValues) {
StringBuilder builder = new StringBuilder();
boolean hit = false;
for (int i = 0; i < fieldValues.length; i++) {
if (queryTerm.equals(fieldValues[i])) {
builder.append("<b>").append(queryTerm).append("</b>");
hit = true;
} else {
builder.append(fieldValues[i]);
}
if (i != fieldValues.length - 1) {
builder.append(' ');
}
}
if (hit) {
builders.add(builder);
}
}
FieldQuery fq = new FieldQuery(tq(queryTerm), true, true);
FieldTermStack stack = new FieldTermStack(reader, randomHit, F, fq);
FieldPhraseList fpl = new FieldPhraseList(stack, fq);
SimpleFragListBuilder sflb = new SimpleFragListBuilder(100);
FieldFragList ffl = sflb.createFieldFragList(fpl, 300);
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
sfb.setDiscreteMultiValueHighlighting(true);
String[] actualFragments = sfb.createFragments(reader, randomHit, F, ffl, numFields);
assertEquals(builders.size(), actualFragments.length);
for (int i = 0; i < actualFragments.length; i++) {
assertEquals(builders.get(i).toString(), actualFragments[i]);
}
}
} finally {
reader.close();
dir.close();
}
}
private String getRandomValue(String[] randomValues, Map<String, Set<Integer>> valueToDocId, int docId) {
String value = randomValues[random().nextInt(randomValues.length)];
if (!valueToDocId.containsKey(value)) {
valueToDocId.put(value, new HashSet<Integer>());
}
valueToDocId.get(value).add(docId);
return value;
}
private static class Doc {
final String[][] fieldValues;
private Doc(String[][] fieldValues) {
this.fieldValues = fieldValues;
}
}
}