LUCENE-4479: Fix highlighter on index w/ positions but no offsets

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400504 13f79535-47bb-0310-9956-ffa450edef68
Alan Woodward 2012-10-20 19:36:02 +00:00
parent 7e45db75e3
commit 61d9cc7c99
6 changed files with 108 additions and 52 deletions


@@ -49,6 +49,10 @@ API Changes
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
no longer seek when writing. (Adrien Grand, Robert Muir)
+* LUCENE-4479: Rename TokenSources.getTokenStream(IndexReader, int, String)
+to TokenSources.getTokenStreamWithOffsets, and return null on failure
+rather than throwing IllegalArgumentException. (Alan Woodward)
* LUCENE-4472: MergePolicy now accepts a MergeTrigger that provides
information about the trigger of the merge, i.e. whether the merge was
triggered by a segment merge or a full flush. (Simon Willnauer)
@@ -67,6 +71,9 @@ Bug Fixes
* LUCENE-4485: When CheckIndex counts terms, terms/docs pairs and tokens,
these counts now all exclude deleted documents. (Mike McCandless)
+* LUCENE-4479: Highlighter works correctly for fields with term vector
+positions, but no offsets. (Alan Woodward)
Optimizations
* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
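To make the LUCENE-4479 API change above concrete: callers that used to catch IllegalArgumentException can switch to a null check. A minimal migration sketch follows (hypothetical caller code, not part of this commit; the class and method names are illustrative, and the two methods target the API before and after this commit respectively, so they would not compile against a single version):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class MigrationSketch {
      // Before this commit: a missing or unusable term vector was signalled by an exception.
      static TokenStream before(IndexReader reader, int docId, String field) throws IOException {
        try {
          return TokenSources.getTokenStream(reader, docId, field);
        } catch (IllegalArgumentException e) {
          return null; // no term vectors, or no position data stored
        }
      }

      // After this commit: the renamed method returns null instead of throwing.
      static TokenStream after(IndexReader reader, int docId, String field) throws IOException {
        return TokenSources.getTokenStreamWithOffsets(reader, docId, field);
      }
    }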


@@ -20,28 +20,26 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Comparator;
/**
* Hides implementation issues associated with obtaining a TokenStream for use
* with the highlighter - can obtain from TermFreqVectors with offsets and
@@ -121,14 +119,10 @@ public class TokenSources {
return getTokenStream(vector, false);
}
-private static boolean hasPositions(Terms vector) throws IOException {
-return vector.hasPositions();
-}
/**
-* Low level api. Returns a token stream or null if no offset info available
-* in index. This can be used to feed the highlighter with a pre-parsed token
-* stream
+* Low level api. Returns a token stream generated from a {@link Terms}. This
+* can be used to feed the highlighter with a pre-parsed token
+* stream. The {@link Terms} must have offsets available.
*
* In my tests the speeds to recreate 1000 token streams using this method
* are: - with TermVector offset only data stored - 420 milliseconds - with
@@ -149,11 +143,18 @@
* @param tokenPositionsGuaranteedContiguous true if the token position
* numbers have no overlaps or gaps. If looking to eek out the last
* drops of performance, set to true. If in doubt, set to false.
*
+* @throws IllegalArgumentException if no offsets are available
*/
public static TokenStream getTokenStream(Terms tpv,
boolean tokenPositionsGuaranteedContiguous)
throws IOException {
-if (!tokenPositionsGuaranteedContiguous && hasPositions(tpv)) {
+if (!tpv.hasOffsets()) {
+throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets");
+}
+if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) {
return new TokenStreamFromTermPositionVector(tpv);
}
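Since getTokenStream(Terms, boolean) now rejects term vectors without offsets up front, a caller that may be handed offset-less vectors can guard the call with Terms.hasOffsets(). A minimal sketch, with an invented helper name:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.search.highlight.TokenSources;

    class OffsetGuard {
      // Returns null rather than letting getTokenStream throw on a vector without offsets.
      static TokenStream tokenStreamOrNull(Terms vector, boolean contiguous) throws IOException {
        if (vector == null || !vector.hasOffsets()) {
          return null;
        }
        return TokenSources.getTokenStream(vector, contiguous);
      }
    }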
@@ -261,24 +262,31 @@ public class TokenSources {
return new StoredTokenStream(tokensInOriginalOrder);
}
-public static TokenStream getTokenStream(IndexReader reader, int docId,
-String field) throws IOException {
+/**
+* Returns a {@link TokenStream} with positions and offsets constructed from
+* field termvectors. If the field has no termvectors, or if positions or offsets
+* are not included in the termvector, returns null.
+* @param reader the {@link IndexReader} to retrieve term vectors from
+* @param docId the document to retrieve termvectors for
+* @param field the field to retrieve termvectors for
+* @return a {@link TokenStream}, or null if positions and offsets are not available
+* @throws IOException If there is a low-level I/O error
+*/
+public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
+String field) throws IOException {
Fields vectors = reader.getTermVectors(docId);
if (vectors == null) {
-throw new IllegalArgumentException(field + " in doc #" + docId
-+ "does not have any term position data stored");
+return null;
}
Terms vector = vectors.terms(field);
if (vector == null) {
-throw new IllegalArgumentException(field + " in doc #" + docId
-+ "does not have any term position data stored");
+return null;
}
-if (!hasPositions(vector)) {
-throw new IllegalArgumentException(field + " in doc #" + docId
-+ "does not have any term position data stored");
+if (!vector.hasPositions() || !vector.hasOffsets()) {
+return null;
}
return getTokenStream(vector);
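For context, a consumer-side sketch of the new entry point (illustrative names; assumes the field's stored text and the query are at hand): when the returned stream is non-null it can be fed straight to the Highlighter, and when it is null the caller knows to re-analyze instead.

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.TokenSources;

    class HighlightSketch {
      static String bestFragment(IndexReader reader, int docId, String field,
                                 String storedText, Query query)
          throws IOException, InvalidTokenOffsetsException {
        TokenStream ts = TokenSources.getTokenStreamWithOffsets(reader, docId, field);
        if (ts == null) {
          return null; // no positions/offsets in the term vector; caller should re-analyze instead
        }
        Highlighter highlighter = new Highlighter(new QueryScorer(query, field));
        return highlighter.getBestFragment(ts, storedText);
      }
    }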


@@ -17,8 +17,6 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
-import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -42,6 +40,8 @@ import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
+import java.io.IOException;
// LUCENE-2874
public class TokenSourcesTest extends LuceneTestCase {
private static final String FIELD = "text";
@@ -260,4 +260,40 @@ public class TokenSourcesTest extends LuceneTestCase {
}
}
+public void testTermVectorWithoutOffsetsThrowsException()
+throws IOException, InvalidTokenOffsetsException {
+final String TEXT = "the fox did not jump";
+final Directory directory = newDirectory();
+final IndexWriter indexWriter = new IndexWriter(directory,
+newIndexWriterConfig(TEST_VERSION_CURRENT, null));
+try {
+final Document document = new Document();
+FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+customType.setStoreTermVectors(true);
+customType.setStoreTermVectorOffsets(false);
+customType.setStoreTermVectorPositions(true);
+document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
+indexWriter.addDocument(document);
+} finally {
+indexWriter.close();
+}
+final IndexReader indexReader = DirectoryReader.open(directory);
+try {
+assertEquals(1, indexReader.numDocs());
+final TokenStream tokenStream = TokenSources
+.getTokenStream(
+indexReader.getTermVector(0, FIELD),
+false);
+fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
+}
+catch (IllegalArgumentException e) {
+// expected
+}
+finally {
+indexReader.close();
+directory.close();
+}
+}
}


@@ -16,18 +16,6 @@
*/
package org.apache.solr.highlight;
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.ListIterator;
-import java.util.Set;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -38,19 +26,16 @@ import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
-import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
-import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
-import org.apache.lucene.search.vectorhighlight.FieldQuery;
-import org.apache.lucene.search.vectorhighlight.FragListBuilder;
-import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.vectorhighlight.*;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
-import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.PluginInfo;
+import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
@@ -62,6 +47,10 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
/**
*
* @since solr 1.3
@@ -462,14 +451,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
List<TextFragment> frags = new ArrayList<TextFragment>();
TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
-try {
-TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
-if (tvStream != null) {
-tots = new TermOffsetsTokenStream(tvStream);
-}
-}
-catch (IllegalArgumentException e) {
-// No problem. But we can't use TermOffsets optimization.
-}
+TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
+if (tvStream != null) {
+tots = new TermOffsetsTokenStream(tvStream);
+}
for (int j = 0; j < docTexts.length; j++) {
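The null return replaces the old exception-driven control flow: when no offset-bearing token stream is available, tots simply stays null and the stored field values are re-analyzed further down. A rough sketch of that decision (not Solr's actual code; names are illustrative):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class SolrFallbackSketch {
      static TokenStream streamForHighlighting(IndexReader reader, int docId, String fieldName,
                                               String storedValue, Analyzer analyzer) throws IOException {
        TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(reader, docId, fieldName);
        if (tvStream != null) {
          return tvStream; // term vector stores both positions and offsets
        }
        // Positions without offsets (e.g. the tv_no_off_* fields below): re-tokenize the stored value.
        return analyzer.tokenStream(fieldName, new StringReader(storedValue));
      }
    }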


@@ -649,6 +649,8 @@
termVectors="true" termPositions="true" termOffsets="true"/>
<dynamicField name="tv_mv_*" type="text" indexed="true" stored="true" multiValued="true"
termVectors="true" termPositions="true" termOffsets="true"/>
<dynamicField name="tv_no_off_*" type="text" indexed="true" stored="true"
termVectors="true" termPositions="true"/>
<dynamicField name="*_p" type="xyd" indexed="true" stored="true" multiValued="false"/>


@@ -150,6 +150,25 @@ public class HighlighterTest extends SolrTestCaseJ4 {
"//arr[@name='tv_text']/str[.=' <em>long</em> fragments.']"
);
}
+@Test
+public void testTermVectorWithoutOffsetsHighlight() {
+HashMap<String,String> args = new HashMap<String, String>();
+args.put("hl", "true");
+args.put("hl.fl", "tv_no_off_text");
+TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory("standard", 0, 200, args);
+assertU(adoc("tv_no_off_text", "Crackerjack Cameron", "id", "1"));
+assertU(commit());
+assertU(optimize());
+assertQ("Fields with term vectors switched on but no offsets should be correctly highlighted",
+sumLRF.makeRequest("tv_no_off_text:cameron"),
+"//arr[@name='tv_no_off_text']/str[.='Crackerjack <em>Cameron</em>']");
+}
@Test
public void testTermOffsetsTokenStream() throws Exception {