mirror of https://github.com/apache/lucene.git

LUCENE-4479: Fix highlighter on index w/ positions but no offsets

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400504 13f79535-47bb-0310-9956-ffa450edef68

parent 7e45db75e3
commit 61d9cc7c99
CHANGES.txt

@@ -49,6 +49,10 @@ API Changes
 * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
   no longer seek when writing. (Adrien Grand, Robert Muir)
 
+* LUCENE-4479: Rename TokenSources.getTokenStream(IndexReader, int, String)
+  to TokenSources.getTokenStreamWithOffsets, and return null on failure
+  rather than throwing IllegalArgumentException. (Alan Woodward)
+
 * LUCENE-4472: MergePolicy now accepts a MergeTrigger that provides
   information about the trigger of the merge ie. merge triggered due
   to a segment merge or a full flush etc. (Simon Willnauer)
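To illustrate the rename above, here is a minimal caller sketch (not part of this commit; the class name, method name, and parameters are hypothetical). Code that previously caught IllegalArgumentException from getTokenStream now checks for a null return instead:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class TokenStreamLookup {
      // Old: TokenSources.getTokenStream(reader, docId, field) threw
      // IllegalArgumentException when no suitable term vector was stored.
      // New: the renamed method signals the same condition with null.
      static TokenStream lookup(IndexReader reader, int docId, String field) throws IOException {
        TokenStream ts = TokenSources.getTokenStreamWithOffsets(reader, docId, field);
        return ts; // null means: highlight some other way, e.g. re-analyze stored text
      }
    }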
@@ -67,6 +71,9 @@ Bug Fixes
 * LUCENE-4485: When CheckIndex counts terms, terms/docs pairs and tokens,
   these counts now all exclude deleted documents. (Mike McCandless)
 
+* LUCENE-4479: Highlighter works correctly for fields with term vector
+  positions, but no offsets. (Alan Woodward)
+
 Optimizations
 
 * LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
TokenSources.java

@@ -20,28 +20,26 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Comparator;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Comparator;
+
 /**
  * Hides implementation issues associated with obtaining a TokenStream for use
  * with the highlighter - can obtain from TermFreqVectors with offsets and
@@ -121,14 +119,10 @@ public class TokenSources {
     return getTokenStream(vector, false);
   }
 
-  private static boolean hasPositions(Terms vector) throws IOException {
-    return vector.hasPositions();
-  }
-
   /**
-   * Low level api. Returns a token stream or null if no offset info available
-   * in index. This can be used to feed the highlighter with a pre-parsed token
-   * stream
+   * Low level api. Returns a token stream generated from a {@link Terms}. This
+   * can be used to feed the highlighter with a pre-parsed token
+   * stream. The {@link Terms} must have offsets available.
    *
    * In my tests the speeds to recreate 1000 token streams using this method
    * are: - with TermVector offset only data stored - 420 milliseconds - with
@@ -149,11 +143,18 @@ public class TokenSources {
    * @param tokenPositionsGuaranteedContiguous true if the token position
    *        numbers have no overlaps or gaps. If looking to eke out the last
    *        drops of performance, set to true. If in doubt, set to false.
+   *
+   * @throws IllegalArgumentException if no offsets are available
    */
   public static TokenStream getTokenStream(Terms tpv,
       boolean tokenPositionsGuaranteedContiguous)
       throws IOException {
-    if (!tokenPositionsGuaranteedContiguous && hasPositions(tpv)) {
+
+    if (!tpv.hasOffsets()) {
+      throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets");
+    }
+
+    if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) {
       return new TokenStreamFromTermPositionVector(tpv);
     }
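The new guard above means a caller of this low-level overload must check Terms.hasOffsets() itself if it wants to avoid the exception. A minimal sketch (not from the commit; the class name and parameters are hypothetical):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.search.highlight.TokenSources;

    class LowLevelLookup {
      // Guard with Terms.hasOffsets() before calling the low-level overload,
      // which now throws IllegalArgumentException when offsets are missing.
      static TokenStream fromVector(IndexReader reader, int docId, String field) throws IOException {
        Terms vector = reader.getTermVector(docId, field);
        if (vector == null || !vector.hasOffsets()) {
          return null; // no term vector stored, or offsets not stored
        }
        return TokenSources.getTokenStream(vector, false);
      }
    }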
@@ -261,24 +262,31 @@ public class TokenSources {
     return new StoredTokenStream(tokensInOriginalOrder);
   }
 
-  public static TokenStream getTokenStream(IndexReader reader, int docId,
-      String field) throws IOException {
+  /**
+   * Returns a {@link TokenStream} with positions and offsets constructed from
+   * field termvectors. If the field has no termvectors, or positions or offsets
+   * are not included in the termvector, return null.
+   * @param reader the {@link IndexReader} to retrieve term vectors from
+   * @param docId the document to retrieve termvectors for
+   * @param field the field to retrieve termvectors for
+   * @return a {@link TokenStream}, or null if positions and offsets are not available
+   * @throws IOException If there is a low-level I/O error
+   */
+  public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
+      String field) throws IOException {
+
     Fields vectors = reader.getTermVectors(docId);
     if (vectors == null) {
-      throw new IllegalArgumentException(field + " in doc #" + docId
-          + "does not have any term position data stored");
+      return null;
     }
 
     Terms vector = vectors.terms(field);
     if (vector == null) {
-      throw new IllegalArgumentException(field + " in doc #" + docId
-          + "does not have any term position data stored");
+      return null;
     }
 
-    if (!hasPositions(vector)) {
-      throw new IllegalArgumentException(field + " in doc #" + docId
-          + "does not have any term position data stored");
+    if (!vector.hasPositions() || !vector.hasOffsets()) {
+      return null;
     }
 
     return getTokenStream(vector);
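A sketch of how a consumer might use the new method end to end (illustrative only, not part of the commit; the class name, stored text, and analyzer are assumptions): prefer the term-vector-backed stream, and re-analyze the stored value when term vectors cannot supply offsets.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class StreamWithFallback {
      // Term vectors with positions and offsets are the cheap path; when they
      // are absent, rebuild the token stream by re-analyzing the stored text.
      static TokenStream streamFor(IndexReader reader, int docId, String field,
          String storedText, Analyzer analyzer) throws IOException {
        TokenStream ts = TokenSources.getTokenStreamWithOffsets(reader, docId, field);
        if (ts != null) {
          return ts;
        }
        return analyzer.tokenStream(field, new StringReader(storedText));
      }
    }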
TokenSourcesTest.java

@@ -17,8 +17,6 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

@@ -42,6 +40,8 @@ import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
 
+import java.io.IOException;
+
 // LUCENE-2874
 public class TokenSourcesTest extends LuceneTestCase {
   private static final String FIELD = "text";
@@ -260,4 +260,40 @@ public class TokenSourcesTest extends LuceneTestCase {
     }
   }
 
+  public void testTermVectorWithoutOffsetsThrowsException()
+      throws IOException, InvalidTokenOffsetsException {
+    final String TEXT = "the fox did not jump";
+    final Directory directory = newDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, null));
+    try {
+      final Document document = new Document();
+      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+      customType.setStoreTermVectors(true);
+      customType.setStoreTermVectorOffsets(false);
+      customType.setStoreTermVectorPositions(true);
+      document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
+      indexWriter.addDocument(document);
+    } finally {
+      indexWriter.close();
+    }
+    final IndexReader indexReader = DirectoryReader.open(directory);
+    try {
+      assertEquals(1, indexReader.numDocs());
+      final TokenStream tokenStream = TokenSources
+          .getTokenStream(
+              indexReader.getTermVector(0, FIELD),
+              false);
+      fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
+    }
+    catch (IllegalArgumentException e) {
+      // expected
+    }
+    finally {
+      indexReader.close();
+      directory.close();
+    }
+  }
+
 }
DefaultSolrHighlighter.java

@@ -16,18 +16,6 @@
  */
 package org.apache.solr.highlight;
 
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.ListIterator;
-import java.util.Set;
-
-
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -38,19 +26,16 @@ import org.apache.lucene.index.StorableField;
 import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.highlight.Formatter;
-import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
-import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
-import org.apache.lucene.search.vectorhighlight.FieldQuery;
-import org.apache.lucene.search.vectorhighlight.FragListBuilder;
-import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
+import org.apache.lucene.search.highlight.*;
+import org.apache.lucene.search.vectorhighlight.*;
 import org.apache.lucene.util.AttributeSource.State;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.HighlightParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
-import org.apache.solr.core.SolrConfig;
 import org.apache.solr.core.PluginInfo;
+import org.apache.solr.core.SolrConfig;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.IndexSchema;
@@ -62,6 +47,10 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
+
 /**
  *
  * @since solr 1.3
@@ -462,14 +451,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized
     List<TextFragment> frags = new ArrayList<TextFragment>();
 
     TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
-    try {
-      TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
-      if (tvStream != null) {
-        tots = new TermOffsetsTokenStream(tvStream);
-      }
-    }
-    catch (IllegalArgumentException e) {
-      // No problem. But we can't use TermOffsets optimization.
-    }
+    TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
+    if (tvStream != null) {
+      tots = new TermOffsetsTokenStream(tvStream);
+    }
 
     for (int j = 0; j < docTexts.length; j++) {
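The Solr change above shows the motivation for the API rename: the old code used IllegalArgumentException for an expected condition (a field stored without offset data), forcing a try/catch around ordinary control flow. Returning null instead lets the highlighter treat "no usable term vector" as a normal outcome and simply skip the TermOffsets optimization.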
schema.xml

@@ -649,6 +649,8 @@
               termVectors="true" termPositions="true" termOffsets="true"/>
    <dynamicField name="tv_mv_*" type="text" indexed="true" stored="true" multiValued="true"
               termVectors="true" termPositions="true" termOffsets="true"/>
+   <dynamicField name="tv_no_off_*" type="text" indexed="true" stored="true"
+              termVectors="true" termPositions="true"/>
 
    <dynamicField name="*_p" type="xyd" indexed="true" stored="true" multiValued="false"/>
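The tv_no_off_* dynamic field added above deliberately stores term vectors with positions but without offsets; it exists to exercise the bug-fix path in the Solr test that follows.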
HighlighterTest.java

@@ -150,6 +150,25 @@ public class HighlighterTest extends SolrTestCaseJ4 {
         "//arr[@name='tv_text']/str[.=' <em>long</em> fragments.']"
         );
   }
 
+  @Test
+  public void testTermVectorWithoutOffsetsHighlight() {
+
+    HashMap<String,String> args = new HashMap<String, String>();
+    args.put("hl", "true");
+    args.put("hl.fl", "tv_no_off_text");
+
+    TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory("standard", 0, 200, args);
+
+    assertU(adoc("tv_no_off_text", "Crackerjack Cameron", "id", "1"));
+    assertU(commit());
+    assertU(optimize());
+
+    assertQ("Fields with term vectors switched on but no offsets should be correctly highlighted",
+        sumLRF.makeRequest("tv_no_off_text:cameron"),
+        "//arr[@name='tv_no_off_text']/str[.='Crackerjack <em>Cameron</em>']");
+
+  }
+
   @Test
   public void testTermOffsetsTokenStream() throws Exception {