LUCENE-4479: Fix highlighter on index w/ positions but no offsets

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400504 13f79535-47bb-0310-9956-ffa450edef68
Alan Woodward 2012-10-20 19:36:02 +00:00
parent 7e45db75e3
commit 61d9cc7c99
6 changed files with 108 additions and 52 deletions


@@ -49,6 +49,10 @@ API Changes
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
no longer seek when writing. (Adrien Grand, Robert Muir)
+* LUCENE-4479: Rename TokenSources.getTokenStream(IndexReader, int, String)
+to TokenSources.getTokenStreamWithOffsets, and return null on failure
+rather than throwing IllegalArgumentException. (Alan Woodward)
* LUCENE-4472: MergePolicy now accepts a MergeTrigger that provides
information about the trigger of the merge, i.e. whether the merge was
triggered by a segment merge or a full flush. (Simon Willnauer)
@@ -67,6 +71,9 @@ Bug Fixes
* LUCENE-4485: When CheckIndex counts terms, terms/docs pairs and tokens,
these counts now all exclude deleted documents. (Mike McCandless)
+* LUCENE-4479: Highlighter works correctly for fields with term vector
+positions, but no offsets. (Alan Woodward)
Optimizations
* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
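To make the LUCENE-4479 API change above concrete: callers that used to catch IllegalArgumentException can switch to a null check. A minimal migration sketch follows (hypothetical caller code, not part of this commit; the class and method names are illustrative, and the two methods target the API before and after this commit respectively, so they would not compile against a single version):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class MigrationSketch {
      // Before this commit: a missing or unusable term vector was signalled by an exception.
      static TokenStream before(IndexReader reader, int docId, String field) throws IOException {
        try {
          return TokenSources.getTokenStream(reader, docId, field);
        } catch (IllegalArgumentException e) {
          return null; // no term vectors, or no position data stored
        }
      }

      // After this commit: the renamed method returns null instead of throwing.
      static TokenStream after(IndexReader reader, int docId, String field) throws IOException {
        return TokenSources.getTokenStreamWithOffsets(reader, docId, field);
      }
    }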


@@ -20,28 +20,26 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Comparator;
/**
* Hides implementation issues associated with obtaining a TokenStream for use
* with the highlighter - can obtain from TermFreqVectors with offsets and
@@ -121,14 +119,10 @@ public class TokenSources {
return getTokenStream(vector, false);
}
-private static boolean hasPositions(Terms vector) throws IOException {
-return vector.hasPositions();
-}
/**
-* Low level api. Returns a token stream or null if no offset info available
-* in index. This can be used to feed the highlighter with a pre-parsed token
-* stream
+* Low level api. Returns a token stream generated from a {@link Terms}. This
+* can be used to feed the highlighter with a pre-parsed token
+* stream. The {@link Terms} must have offsets available.
*
* In my tests the speeds to recreate 1000 token streams using this method
* are: - with TermVector offset only data stored - 420 milliseconds - with
@@ -149,11 +143,18 @@
* @param tokenPositionsGuaranteedContiguous true if the token position
* numbers have no overlaps or gaps. If looking to eek out the last
* drops of performance, set to true. If in doubt, set to false.
*
+* @throws IllegalArgumentException if no offsets are available
*/
public static TokenStream getTokenStream(Terms tpv,
boolean tokenPositionsGuaranteedContiguous)
throws IOException {
-if (!tokenPositionsGuaranteedContiguous && hasPositions(tpv)) {
+if (!tpv.hasOffsets()) {
+throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets");
+}
+if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) {
return new TokenStreamFromTermPositionVector(tpv);
}
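Since getTokenStream(Terms, boolean) now rejects term vectors without offsets up front, a caller that may be handed offset-less vectors can guard the call with Terms.hasOffsets(). A minimal sketch, with an invented helper name:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.search.highlight.TokenSources;

    class OffsetGuard {
      // Returns null rather than letting getTokenStream throw on a vector without offsets.
      static TokenStream tokenStreamOrNull(Terms vector, boolean contiguous) throws IOException {
        if (vector == null || !vector.hasOffsets()) {
          return null;
        }
        return TokenSources.getTokenStream(vector, contiguous);
      }
    }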
@@ -261,24 +262,31 @@ public class TokenSources {
return new StoredTokenStream(tokensInOriginalOrder);
}
-public static TokenStream getTokenStream(IndexReader reader, int docId,
-String field) throws IOException {
+/**
+* Returns a {@link TokenStream} with positions and offsets constructed from
+* field termvectors. If the field has no termvectors, or if positions or offsets
+* are not included in the termvector, returns null.
+* @param reader the {@link IndexReader} to retrieve term vectors from
+* @param docId the document to retrieve termvectors for
+* @param field the field to retrieve termvectors for
+* @return a {@link TokenStream}, or null if positions and offsets are not available
+* @throws IOException If there is a low-level I/O error
+*/
+public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
+String field) throws IOException {
Fields vectors = reader.getTermVectors(docId);
if (vectors == null) {
-throw new IllegalArgumentException(field + " in doc #" + docId
-+ "does not have any term position data stored");
+return null;
}
Terms vector = vectors.terms(field);
if (vector == null) {
-throw new IllegalArgumentException(field + " in doc #" + docId
-+ "does not have any term position data stored");
+return null;
}
-if (!hasPositions(vector)) {
-throw new IllegalArgumentException(field + " in doc #" + docId
-+ "does not have any term position data stored");
+if (!vector.hasPositions() || !vector.hasOffsets()) {
+return null;
}
return getTokenStream(vector);
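For context, a consumer-side sketch of the new entry point (illustrative names; assumes the field's stored text and the query are at hand): when the returned stream is non-null it can be fed straight to the Highlighter, and when it is null the caller knows to re-analyze instead.

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.TokenSources;

    class HighlightSketch {
      static String bestFragment(IndexReader reader, int docId, String field,
                                 String storedText, Query query)
          throws IOException, InvalidTokenOffsetsException {
        TokenStream ts = TokenSources.getTokenStreamWithOffsets(reader, docId, field);
        if (ts == null) {
          return null; // no positions/offsets in the term vector; caller should re-analyze instead
        }
        Highlighter highlighter = new Highlighter(new QueryScorer(query, field));
        return highlighter.getBestFragment(ts, storedText);
      }
    }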


@@ -17,8 +17,6 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
-import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -42,6 +40,8 @@ import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
+import java.io.IOException;
// LUCENE-2874
public class TokenSourcesTest extends LuceneTestCase {
private static final String FIELD = "text";
@@ -260,4 +260,40 @@ public class TokenSourcesTest extends LuceneTestCase {
}
}
+public void testTermVectorWithoutOffsetsThrowsException()
+throws IOException, InvalidTokenOffsetsException {
+final String TEXT = "the fox did not jump";
+final Directory directory = newDirectory();
+final IndexWriter indexWriter = new IndexWriter(directory,
+newIndexWriterConfig(TEST_VERSION_CURRENT, null));
+try {
+final Document document = new Document();
+FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+customType.setStoreTermVectors(true);
+customType.setStoreTermVectorOffsets(false);
+customType.setStoreTermVectorPositions(true);
+document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
+indexWriter.addDocument(document);
+} finally {
+indexWriter.close();
+}
+final IndexReader indexReader = DirectoryReader.open(directory);
+try {
+assertEquals(1, indexReader.numDocs());
+final TokenStream tokenStream = TokenSources
+.getTokenStream(
+indexReader.getTermVector(0, FIELD),
+false);
+fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
+}
+catch (IllegalArgumentException e) {
+// expected
+}
+finally {
+indexReader.close();
+directory.close();
+}
+}
}


@@ -16,18 +16,6 @@
*/
package org.apache.solr.highlight;
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.ListIterator;
-import java.util.Set;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -38,19 +26,16 @@ import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
-import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
-import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
-import org.apache.lucene.search.vectorhighlight.FieldQuery;
-import org.apache.lucene.search.vectorhighlight.FragListBuilder;
-import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.vectorhighlight.*;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
-import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.PluginInfo;
+import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
@@ -62,6 +47,10 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
/**
*
* @since solr 1.3
@@ -462,14 +451,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
List<TextFragment> frags = new ArrayList<TextFragment>();
TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
-try {
-TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
-if (tvStream != null) {
-tots = new TermOffsetsTokenStream(tvStream);
-}
-}
-catch (IllegalArgumentException e) {
-// No problem. But we can't use TermOffsets optimization.
-}
+TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
+if (tvStream != null) {
+tots = new TermOffsetsTokenStream(tvStream);
+}
for (int j = 0; j < docTexts.length; j++) {
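The null return replaces the old exception-driven control flow: when no offset-bearing token stream is available, tots simply stays null and the stored field values are re-analyzed further down. A rough sketch of that decision (not Solr's actual code; names are illustrative):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class SolrFallbackSketch {
      static TokenStream streamForHighlighting(IndexReader reader, int docId, String fieldName,
                                               String storedValue, Analyzer analyzer) throws IOException {
        TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(reader, docId, fieldName);
        if (tvStream != null) {
          return tvStream; // term vector stores both positions and offsets
        }
        // Positions without offsets (e.g. the tv_no_off_* fields below): re-tokenize the stored value.
        return analyzer.tokenStream(fieldName, new StringReader(storedValue));
      }
    }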


@@ -649,6 +649,8 @@
termVectors="true" termPositions="true" termOffsets="true"/>
<dynamicField name="tv_mv_*" type="text" indexed="true" stored="true" multiValued="true"
termVectors="true" termPositions="true" termOffsets="true"/>
<dynamicField name="tv_no_off_*" type="text" indexed="true" stored="true"
termVectors="true" termPositions="true"/>
<dynamicField name="*_p" type="xyd" indexed="true" stored="true" multiValued="false"/>


@@ -150,6 +150,25 @@ public class HighlighterTest extends SolrTestCaseJ4 {
"//arr[@name='tv_text']/str[.=' <em>long</em> fragments.']"
);
}
+@Test
+public void testTermVectorWithoutOffsetsHighlight() {
+HashMap<String,String> args = new HashMap<String, String>();
+args.put("hl", "true");
+args.put("hl.fl", "tv_no_off_text");
+TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory("standard", 0, 200, args);
+assertU(adoc("tv_no_off_text", "Crackerjack Cameron", "id", "1"));
+assertU(commit());
+assertU(optimize());
+assertQ("Fields with term vectors switched on but no offsets should be correctly highlighted",
+sumLRF.makeRequest("tv_no_off_text:cameron"),
+"//arr[@name='tv_no_off_text']/str[.='Crackerjack <em>Cameron</em>']");
+}
@Test
public void testTermOffsetsTokenStream() throws Exception {