mirror of https://github.com/apache/lucene.git
LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to
respect field boundaries in the case of multivalued fields. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1362223 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
963efbfea2
commit
858dbbdba4
|
@ -18,6 +18,9 @@ New features
|
|||
create automata from a fixed collection of UTF-8 encoded BytesRef
|
||||
(Dawid Weiss, Robert Muir)
|
||||
|
||||
* LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to
|
||||
respect field boundaries in the case of multivalued fields. (Martijn van Groningen)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
|
||||
|
|
|
@ -17,10 +17,6 @@ package org.apache.lucene.search.vectorhighlight;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
|
@ -29,10 +25,19 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.StoredFieldVisitor;
|
||||
import org.apache.lucene.search.highlight.DefaultEncoder;
|
||||
import org.apache.lucene.search.highlight.Encoder;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
|
||||
|
||||
protected String[] preTags, postTags;
|
||||
|
@ -48,6 +53,7 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
|
|||
public static final String[] COLORED_POST_TAGS = { "</b>" };
|
||||
private char multiValuedSeparator = ' ';
|
||||
private final BoundaryScanner boundaryScanner;
|
||||
private boolean discreteMultiValueHighlighting = false;
|
||||
|
||||
protected BaseFragmentsBuilder(){
|
||||
this( new String[]{ "<b>" }, new String[]{ "</b>" } );
|
||||
|
@ -76,7 +82,7 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
|
|||
public abstract List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src );
|
||||
|
||||
private static final Encoder NULL_ENCODER = new DefaultEncoder();
|
||||
|
||||
|
||||
public String createFragment( IndexReader reader, int docId,
|
||||
String fieldName, FieldFragList fieldFragList ) throws IOException {
|
||||
return createFragment( reader, docId, fieldName, fieldFragList,
|
||||
|
@ -102,14 +108,23 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
|
|||
public String[] createFragments( IndexReader reader, int docId,
|
||||
String fieldName, FieldFragList fieldFragList, int maxNumFragments,
|
||||
String[] preTags, String[] postTags, Encoder encoder ) throws IOException {
|
||||
if( maxNumFragments < 0 )
|
||||
throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
|
||||
|
||||
List<WeightedFragInfo> fragInfos = getWeightedFragInfoList( fieldFragList.getFragInfos() );
|
||||
|
||||
if( maxNumFragments < 0 ) {
|
||||
throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
|
||||
}
|
||||
|
||||
List<WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
|
||||
List<String> fragments = new ArrayList<String>( maxNumFragments );
|
||||
Field[] values = getFields( reader, docId, fieldName );
|
||||
if( values.length == 0 ) return null;
|
||||
if( values.length == 0 ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (discreteMultiValueHighlighting && values.length > 1) {
|
||||
fragInfos = discreteMultiValueHighlighting(fragInfos, values);
|
||||
}
|
||||
|
||||
fragInfos = getWeightedFragInfoList(fragInfos);
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
int[] nextValueIndex = { 0 };
|
||||
for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){
|
||||
|
@ -186,7 +201,92 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
|
|||
int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
|
||||
return buffer.substring( startOffset, eo );
|
||||
}
|
||||
|
||||
|
||||
protected List<WeightedFragInfo> discreteMultiValueHighlighting(List<WeightedFragInfo> fragInfos, Field[] fields) {
|
||||
Map<String, List<WeightedFragInfo>> fieldNameToFragInfos = new HashMap<String, List<WeightedFragInfo>>();
|
||||
for (Field field : fields) {
|
||||
fieldNameToFragInfos.put(field.name(), new ArrayList<WeightedFragInfo>());
|
||||
}
|
||||
|
||||
fragInfos: for (WeightedFragInfo fragInfo : fragInfos) {
|
||||
int fieldStart;
|
||||
int fieldEnd = 0;
|
||||
for (Field field : fields) {
|
||||
if (field.stringValue().isEmpty()) {
|
||||
fieldEnd++;
|
||||
continue;
|
||||
}
|
||||
fieldStart = fieldEnd;
|
||||
fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name.
|
||||
|
||||
if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart &&
|
||||
fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) {
|
||||
fieldNameToFragInfos.get(field.name()).add(fragInfo);
|
||||
continue fragInfos;
|
||||
}
|
||||
|
||||
if (fragInfo.getSubInfos().isEmpty()) {
|
||||
continue fragInfos;
|
||||
}
|
||||
|
||||
Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0);
|
||||
if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int fragStart = fieldStart;
|
||||
if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) {
|
||||
fragStart = fragInfo.getStartOffset();
|
||||
}
|
||||
|
||||
int fragEnd = fieldEnd;
|
||||
if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) {
|
||||
fragEnd = fragInfo.getEndOffset();
|
||||
}
|
||||
|
||||
|
||||
List<SubInfo> subInfos = new ArrayList<SubInfo>();
|
||||
WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, fragInfo.getTotalBoost());
|
||||
|
||||
Iterator<SubInfo> subInfoIterator = fragInfo.getSubInfos().iterator();
|
||||
while (subInfoIterator.hasNext()) {
|
||||
SubInfo subInfo = subInfoIterator.next();
|
||||
List<Toffs> toffsList = new ArrayList<Toffs>();
|
||||
Iterator<Toffs> toffsIterator = subInfo.getTermsOffsets().iterator();
|
||||
while (toffsIterator.hasNext()) {
|
||||
Toffs toffs = toffsIterator.next();
|
||||
if (toffs.getStartOffset() >= fieldStart && toffs.getEndOffset() <= fieldEnd) {
|
||||
toffsList.add(toffs);
|
||||
toffsIterator.remove();
|
||||
}
|
||||
}
|
||||
if (!toffsList.isEmpty()) {
|
||||
subInfos.add(new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum()));
|
||||
}
|
||||
|
||||
if (subInfo.getTermsOffsets().isEmpty()) {
|
||||
subInfoIterator.remove();
|
||||
}
|
||||
}
|
||||
fieldNameToFragInfos.get(field.name()).add(weightedFragInfo);
|
||||
}
|
||||
}
|
||||
|
||||
List<WeightedFragInfo> result = new ArrayList<WeightedFragInfo>();
|
||||
for (List<WeightedFragInfo> weightedFragInfos : fieldNameToFragInfos.values()) {
|
||||
result.addAll(weightedFragInfos);
|
||||
}
|
||||
Collections.sort(result, new Comparator<WeightedFragInfo>() {
|
||||
|
||||
public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) {
|
||||
return info1.getStartOffset() - info2.getStartOffset();
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public void setMultiValuedSeparator( char separator ){
|
||||
multiValuedSeparator = separator;
|
||||
}
|
||||
|
@ -195,6 +295,14 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
|
|||
return multiValuedSeparator;
|
||||
}
|
||||
|
||||
public boolean isDiscreteMultiValueHighlighting() {
|
||||
return discreteMultiValueHighlighting;
|
||||
}
|
||||
|
||||
public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
|
||||
this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
|
||||
}
|
||||
|
||||
protected String getPreTag( int num ){
|
||||
return getPreTag( preTags, num );
|
||||
}
|
||||
|
|
|
@ -17,12 +17,12 @@ package org.apache.lucene.search.vectorhighlight;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* FieldFragList has a list of "frag info" that is used by FragmentsBuilder class
|
||||
* to create fragments (snippets).
|
||||
|
@ -116,7 +116,11 @@ public abstract class FieldFragList {
|
|||
public int getSeqnum(){
|
||||
return seqnum;
|
||||
}
|
||||
|
||||
|
||||
public String getText(){
|
||||
return text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(){
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
|
|
@ -17,20 +17,32 @@ package org.apache.lucene.search.vectorhighlight;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
public class SimpleFragmentsBuilderTest extends AbstractTestCase {
|
||||
|
||||
|
@ -175,4 +187,152 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase {
|
|||
sfb.setMultiValuedSeparator( '/' );
|
||||
assertEquals( "//a b c//<b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
|
||||
}
|
||||
|
||||
public void testDiscreteMultiValueHighlighting() throws Exception {
|
||||
makeIndexShortMV();
|
||||
|
||||
FieldQuery fq = new FieldQuery( tq( "d" ), true, true );
|
||||
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
|
||||
FieldPhraseList fpl = new FieldPhraseList( stack, fq );
|
||||
SimpleFragListBuilder sflb = new SimpleFragListBuilder();
|
||||
FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
|
||||
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
|
||||
sfb.setDiscreteMultiValueHighlighting(true);
|
||||
assertEquals( "<b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
|
||||
|
||||
make1dmfIndex("some text to highlight", "highlight other text");
|
||||
fq = new FieldQuery( tq( "text" ), true, true );
|
||||
stack = new FieldTermStack( reader, 0, F, fq );
|
||||
fpl = new FieldPhraseList( stack, fq );
|
||||
sflb = new SimpleFragListBuilder();
|
||||
ffl = sflb.createFieldFragList( fpl, 32 );
|
||||
String[] result = sfb.createFragments(reader, 0, F, ffl, 3);
|
||||
assertEquals(2, result.length);
|
||||
assertEquals("some <b>text</b> to highlight", result[0]);
|
||||
assertEquals("other <b>text</b>", result[1]);
|
||||
|
||||
fq = new FieldQuery( tq( "highlight" ), true, true );
|
||||
stack = new FieldTermStack( reader, 0, F, fq );
|
||||
fpl = new FieldPhraseList( stack, fq );
|
||||
sflb = new SimpleFragListBuilder();
|
||||
ffl = sflb.createFieldFragList( fpl, 32 );
|
||||
result = sfb.createFragments(reader, 0, F, ffl, 3);
|
||||
assertEquals(2, result.length);
|
||||
assertEquals("text to <b>highlight</b>", result[0]);
|
||||
assertEquals("<b>highlight</b> other text", result[1]);
|
||||
}
|
||||
|
||||
public void testRandomDiscreteMultiValueHighlighting() throws Exception {
|
||||
String[] randomValues = new String[3 + random().nextInt(10 * RANDOM_MULTIPLIER)];
|
||||
for (int i = 0; i < randomValues.length; i++) {
|
||||
String randomValue;
|
||||
do {
|
||||
randomValue = _TestUtil.randomSimpleString(random());
|
||||
} while ("".equals(randomValue));
|
||||
randomValues[i] = randomValue;
|
||||
}
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(
|
||||
random(),
|
||||
dir,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT,
|
||||
new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
|
||||
|
||||
FieldType customType = new FieldType(TextField.TYPE_STORED);
|
||||
customType.setStoreTermVectors(true);
|
||||
customType.setStoreTermVectorOffsets(true);
|
||||
customType.setStoreTermVectorPositions(true);
|
||||
|
||||
int numDocs = randomValues.length * 5;
|
||||
int numFields = 2 + random().nextInt(5);
|
||||
int numTerms = 2 + random().nextInt(3);
|
||||
List<Doc> docs = new ArrayList<Doc>(numDocs);
|
||||
List<Document> documents = new ArrayList<Document>(numDocs);
|
||||
Map<String, Set<Integer>> valueToDocId = new HashMap<String, Set<Integer>>();
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
Document document = new Document();
|
||||
String[][] fields = new String[numFields][numTerms];
|
||||
for (int j = 0; j < numFields; j++) {
|
||||
String[] fieldValues = new String[numTerms];
|
||||
fieldValues[0] = getRandomValue(randomValues, valueToDocId, i);
|
||||
StringBuilder builder = new StringBuilder(fieldValues[0]);
|
||||
for (int k = 1; k < numTerms; k++) {
|
||||
fieldValues[k] = getRandomValue(randomValues, valueToDocId, i);
|
||||
builder.append(' ').append(fieldValues[k]);
|
||||
}
|
||||
document.add(new Field(F, builder.toString(), customType));
|
||||
fields[j] = fieldValues;
|
||||
}
|
||||
docs.add(new Doc(fields));
|
||||
documents.add(document);
|
||||
}
|
||||
writer.addDocuments(documents);
|
||||
writer.close();
|
||||
IndexReader reader = DirectoryReader.open(dir);
|
||||
|
||||
try {
|
||||
int highlightIters = 1 + random().nextInt(120 * RANDOM_MULTIPLIER);
|
||||
for (int highlightIter = 0; highlightIter < highlightIters; highlightIter++) {
|
||||
String queryTerm = randomValues[random().nextInt(randomValues.length)];
|
||||
int randomHit = valueToDocId.get(queryTerm).iterator().next();
|
||||
List<StringBuilder> builders = new ArrayList<StringBuilder>();
|
||||
for (String[] fieldValues : docs.get(randomHit).fieldValues) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
boolean hit = false;
|
||||
for (int i = 0; i < fieldValues.length; i++) {
|
||||
if (queryTerm.equals(fieldValues[i])) {
|
||||
builder.append("<b>").append(queryTerm).append("</b>");
|
||||
hit = true;
|
||||
} else {
|
||||
builder.append(fieldValues[i]);
|
||||
}
|
||||
if (i != fieldValues.length - 1) {
|
||||
builder.append(' ');
|
||||
}
|
||||
}
|
||||
if (hit) {
|
||||
builders.add(builder);
|
||||
}
|
||||
}
|
||||
|
||||
FieldQuery fq = new FieldQuery(tq(queryTerm), true, true);
|
||||
FieldTermStack stack = new FieldTermStack(reader, randomHit, F, fq);
|
||||
|
||||
FieldPhraseList fpl = new FieldPhraseList(stack, fq);
|
||||
SimpleFragListBuilder sflb = new SimpleFragListBuilder(100);
|
||||
FieldFragList ffl = sflb.createFieldFragList(fpl, 300);
|
||||
|
||||
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
|
||||
sfb.setDiscreteMultiValueHighlighting(true);
|
||||
String[] actualFragments = sfb.createFragments(reader, randomHit, F, ffl, numFields);
|
||||
assertEquals(builders.size(), actualFragments.length);
|
||||
for (int i = 0; i < actualFragments.length; i++) {
|
||||
assertEquals(builders.get(i).toString(), actualFragments[i]);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
||||
private String getRandomValue(String[] randomValues, Map<String, Set<Integer>> valueToDocId, int docId) {
|
||||
String value = randomValues[random().nextInt(randomValues.length)];
|
||||
if (!valueToDocId.containsKey(value)) {
|
||||
valueToDocId.put(value, new HashSet<Integer>());
|
||||
}
|
||||
valueToDocId.get(value).add(docId);
|
||||
return value;
|
||||
}
|
||||
|
||||
private static class Doc {
|
||||
|
||||
final String[][] fieldValues;
|
||||
|
||||
private Doc(String[][] fieldValues) {
|
||||
this.fieldValues = fieldValues;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue