From c831eb9c14ccf849db369281d15d7d8bf5aa8a8b Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Thu, 25 Feb 2010 02:24:31 +0000 Subject: [PATCH] LUCENE-2278: FastVectorHighlighter: highlighted term is out of alignment in multi-valued NOT_ANALYZED field git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@916090 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/CHANGES.txt | 3 ++ .../vectorhighlight/BaseFragmentsBuilder.java | 35 ++++++++++++++++-- .../vectorhighlight/AbstractTestCase.java | 36 +++++++++++++++++++ .../SimpleFragmentsBuilderTest.java | 12 +++++++ 4 files changed, 83 insertions(+), 3 deletions(-) diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 80fae983512..64a361edf7c 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -48,6 +48,9 @@ Bug fixes the previous code did not always implement the Snowball algorithm correctly. Additionally, for Version > 3.0, the Snowball stopword lists are used by default. (Robert Muir, Uwe Schindler, Simon Willnauer) + + * LUCENE-2278: FastVectorHighlighter: Highlighted term is out of alignment + in multi-valued NOT_ANALYZED field. (Koji Sekiguchi) API Changes diff --git a/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java b/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java index 307e9c6f37f..9b22433bbed 100644 --- a/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java +++ b/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.MapFieldSelector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; @@ -72,7 +73,7 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder { List fragInfos = getWeightedFragInfoList( fieldFragList.fragInfos ); List fragments = new ArrayList( maxNumFragments ); - String[] values = getFieldValues( reader, docId, fieldName ); + Field[] values = getFields( reader, docId, fieldName ); if( values.length == 0 ) return null; StringBuilder buffer = new StringBuilder(); int[] nextValueIndex = { 0 }; @@ -83,15 +84,31 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder { return fragments.toArray( new String[fragments.size()] ); } + @Deprecated protected String[] getFieldValues( IndexReader reader, int docId, String fieldName) throws IOException { Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); return doc.getValues( fieldName ); // according to Document class javadoc, this never returns null } + + protected Field[] getFields( IndexReader reader, int docId, String fieldName) throws IOException { + // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field??? + Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); + return doc.getFields( fieldName ); // according to Document class javadoc, this never returns null + } + @Deprecated protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){ - StringBuilder fragment = new StringBuilder(); final int s = fragInfo.startOffset; - String src = getFragmentSource( buffer, index, values, s, fragInfo.endOffset ); + return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s ); + } + + protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo ){ + final int s = fragInfo.startOffset; + return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s ); + } + + private String makeFragment( WeightedFragInfo fragInfo, String src, int s ){ + StringBuilder fragment = new StringBuilder(); int srcIndex = 0; for( SubInfo subInfo : fragInfo.subInfos ){ for( Toffs to : subInfo.termsOffsets ){ @@ -104,6 +121,7 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder { return fragment.toString(); } + @Deprecated protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values, int startOffset, int endOffset ){ while( buffer.length() < endOffset && index[0] < values.length ){ @@ -114,6 +132,17 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder { int eo = buffer.length() < endOffset ? buffer.length() : endOffset; return buffer.substring( startOffset, eo ); } + + protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values, + int startOffset, int endOffset ){ + while( buffer.length() < endOffset && index[0] < values.length ){ + if( index[0] > 0 && values[index[0]].isTokenized() && values[index[0]].stringValue().length() > 0 ) + buffer.append( ' ' ); + buffer.append( values[index[0]++].stringValue() ); + } + int eo = buffer.length() < endOffset ? buffer.length() : endOffset; + return buffer.substring( startOffset, eo ); + } protected String getPreTag( int num ){ return preTags.length > num ? preTags[num] : preTags[0]; diff --git a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java index cfc2ceebd27..979967dcc9b 100644 --- a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java +++ b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java @@ -24,6 +24,7 @@ import java.util.Collection; import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.KeywordAnalyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -56,6 +57,7 @@ public abstract class AbstractTestCase extends TestCase { protected Directory dir; protected Analyzer analyzerW; protected Analyzer analyzerB; + protected Analyzer analyzerK; protected IndexReader reader; protected QueryParser paW; protected QueryParser paB; @@ -76,11 +78,18 @@ public abstract class AbstractTestCase extends TestCase { "\nLucene/Solr does not require such additional hardware.", "\nWhen you talk about processing speed, the" }; + + protected static final String[] strMVValues = { + "abc", + "defg", + "hijkl" + }; @Override protected void setUp() throws Exception { analyzerW = new WhitespaceAnalyzer(Version.LUCENE_CURRENT); analyzerB = new BigramAnalyzer(); + analyzerK = new KeywordAnalyzer(); paW = new QueryParser(Version.LUCENE_CURRENT, F, analyzerW ); paB = new QueryParser(Version.LUCENE_CURRENT, F, analyzerB ); dir = new RAMDirectory(); @@ -314,6 +323,7 @@ public abstract class AbstractTestCase extends TestCase { make1dmfIndex( analyzerB, values ); } + // make 1 doc with multi valued field protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception { IndexWriter writer = new IndexWriter( dir, analyzer, true, MaxFieldLength.LIMITED ); Document doc = new Document(); @@ -325,6 +335,18 @@ public abstract class AbstractTestCase extends TestCase { reader = IndexReader.open( dir, true ); } + // make 1 doc with multi valued & not analyzed field + protected void make1dmfIndexNA( String... values ) throws Exception { + IndexWriter writer = new IndexWriter( dir, analyzerK, true, MaxFieldLength.LIMITED ); + Document doc = new Document(); + for( String value: values ) + doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); + writer.addDocument( doc ); + writer.close(); + + reader = IndexReader.open( dir, true ); + } + protected void makeIndexShortMV() throws Exception { // 012345 @@ -386,4 +408,18 @@ public abstract class AbstractTestCase extends TestCase { make1dmfIndexB( biMVValues ); } + + protected void makeIndexStrMV() throws Exception { + + // 0123 + // "abc" + + // 34567 + // "defg" + + // 111 + // 789012 + // "hijkl" + make1dmfIndexNA( strMVValues ); + } } diff --git a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java index 5e017d04da8..0fe683c20ab 100644 --- a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java +++ b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java @@ -127,4 +127,16 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { reader = IndexReader.open( dir, true ); } + + public void test1StrMV() throws Exception { + makeIndexStrMV(); + + FieldQuery fq = new FieldQuery( tq( "defg" ), true, true ); + FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); + FieldPhraseList fpl = new FieldPhraseList( stack, fq ); + SimpleFragListBuilder sflb = new SimpleFragListBuilder(); + FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); + SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); + assertEquals( "abcdefghijkl", sfb.createFragment( reader, 0, F, ffl ) ); + } }