LUCENE-4860: per-field control over scoring and formatting

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1459816 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2013-03-22 14:50:11 +00:00
parent 97ec69bb69
commit 0e830fbae4
6 changed files with 115 additions and 49 deletions


@@ -130,6 +130,12 @@ New Features
* LUCENE-4752: New SortingMergePolicy (in lucene/misc) that sorts documents
before merging segments. (Adrien Grand, Shai Erera, David Smiley)
* LUCENE-4860: Customize scoring and formatting per-field in
PostingsHighlighter by subclassing and overriding the getFormatter
and/or getScorer methods. This also changes Passage.getMatchTerms()
to return BytesRef[] instead of Term[]. (Robert Muir, Mike
McCandless)
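
To illustrate the entry above: subclass PostingsHighlighter and override getFormatter and/or getScorer to pick a formatter or scorer per field. A minimal sketch; the "title" field name, the tag strings, and the scorer parameters are illustrative, not part of this commit:

import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PassageScorer;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;

public class PerFieldHighlighterSketch {
  // Hypothetical setup: "title" gets <b> tags and its own scorer settings,
  // every other field falls back to the cached defaults in the base class.
  static PostingsHighlighter newHighlighter() {
    return new PostingsHighlighter() {
      @Override
      protected PassageFormatter getFormatter(String field) {
        if ("title".equals(field)) {
          return new PassageFormatter("<b>", "</b>", "... ");
        }
        return super.getFormatter(field);
      }

      @Override
      protected PassageScorer getScorer(String field) {
        if ("title".equals(field)) {
          // k1, b, pivot: illustrative values only
          return new PassageScorer(1.2f, 0.25f, 64);
        }
        return super.getScorer(field);
      }
    };
  }
}

Because both hooks receive the field name, one highlighter instance can format and score each field differently within a single highlightFields call.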
API Changes
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use


@@ -17,8 +17,8 @@ package org.apache.lucene.search.postingshighlight;
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate;
@@ -36,15 +36,15 @@ public final class Passage {
int matchStarts[] = new int[8];
int matchEnds[] = new int[8];
Term matchTerms[] = new Term[8];
BytesRef matchTerms[] = new BytesRef[8];
int numMatches = 0;
void addMatch(int startOffset, int endOffset, Term term) {
void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) {
matchStarts = ArrayUtil.grow(matchStarts, numMatches+1);
matchEnds = ArrayUtil.grow(matchEnds, numMatches+1);
Term newMatchTerms[] = new Term[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
BytesRef newMatchTerms[] = new BytesRef[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
matchTerms = newMatchTerms;
}
@@ -57,7 +57,7 @@ public final class Passage {
void sort() {
final int starts[] = matchStarts;
final int ends[] = matchEnds;
final Term terms[] = matchTerms;
final BytesRef terms[] = matchTerms;
new SorterTemplate() {
@Override
protected void swap(int i, int j) {
@@ -69,7 +69,7 @@ public final class Passage {
ends[i] = ends[j];
ends[j] = temp;
Term tempTerm = terms[i];
BytesRef tempTerm = terms[i];
terms[i] = terms[j];
terms[j] = tempTerm;
}
@@ -155,11 +155,11 @@ public final class Passage {
}
/**
* Term of the matches, corresponding with {@link #getMatchStarts()}.
* BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}.
* <p>
* Only {@link #getNumMatches()} are valid.
*/
public Term[] getMatchTerms() {
public BytesRef[] getMatchTerms() {
return matchTerms;
}
}
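
Since Passage.getMatchTerms() now returns BytesRef[] rather than Term[], consumers such as a custom PassageFormatter decode the term bytes themselves; the field is implied by the field being highlighted instead of being carried on each match. A small sketch (the class and method names here are invented for illustration):

import org.apache.lucene.search.postingshighlight.Passage;
import org.apache.lucene.util.BytesRef;

public class PassageMatchDump {
  // Hypothetical helper: print every match in a passage.
  public static void dumpMatches(Passage passage) {
    BytesRef[] terms = passage.getMatchTerms();
    int[] starts = passage.getMatchStarts();
    int[] ends = passage.getMatchEnds();
    // Only the first getNumMatches() entries of the parallel arrays are valid.
    for (int i = 0; i < passage.getNumMatches(); i++) {
      // Term text is raw UTF-8 bytes now; decode it for display.
      System.out.println(terms[i].utf8ToString()
          + " @ [" + starts[i] + ", " + ends[i] + ")");
    }
  }
}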


@@ -97,8 +97,14 @@ public class PostingsHighlighter {
private final int maxLength;
private final BreakIterator breakIterator;
private final PassageScorer scorer;
private final PassageFormatter formatter;
/** Set the first time {@link #getFormatter} is called,
* and then reused. */
private PassageFormatter defaultFormatter;
/** Set the first time {@link #getScorer} is called,
* and then reused. */
private PassageScorer defaultScorer;
/**
* Creates a new highlighter with default parameters.
@@ -113,7 +119,7 @@ public class PostingsHighlighter {
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
*/
public PostingsHighlighter(int maxLength) {
this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT));
}
/**
@@ -122,11 +128,9 @@ public class PostingsHighlighter {
* @param breakIterator used for finding passage
* boundaries; pass null to highlight the entire
* content as a single Passage.
* @param scorer used for ranking passages.
* @param formatter used for formatting passages into highlighted snippets.
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
*/
public PostingsHighlighter(int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
public PostingsHighlighter(int maxLength, BreakIterator breakIterator) {
if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
// two reasons: no overflow problems in BreakIterator.preceding(offset+1),
// our sentinel in the offsets queue uses this value to terminate.
@@ -135,13 +139,30 @@ public class PostingsHighlighter {
if (breakIterator == null) {
breakIterator = new WholeBreakIterator();
}
if (scorer == null || formatter == null) {
throw new NullPointerException();
}
this.maxLength = maxLength;
this.breakIterator = breakIterator;
this.scorer = scorer;
this.formatter = formatter;
}
/** Returns the {@link PassageFormatter} to use for
* formatting passages into highlighted snippets. This
* returns a new {@code PassageFormatter} by default;
* subclasses can override to customize. */
protected PassageFormatter getFormatter(String field) {
if (defaultFormatter == null) {
defaultFormatter = new PassageFormatter();
}
return defaultFormatter;
}
/** Returns the {@link PassageScorer} to use for
* ranking passages. This
* returns a new {@code PassageScorer} by default;
* subclasses can override to customize. */
protected PassageScorer getScorer(String field) {
if (defaultScorer == null) {
defaultScorer = new PassageScorer();
}
return defaultScorer;
}
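
The base class lazily creates one default formatter and one default scorer and reuses them across calls. A subclass that returns per-field instances can mirror that pattern so it does not allocate a new formatter on every call; a sketch, assuming the formatters are safe to reuse across calls:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;

public class CachingPerFieldHighlighter extends PostingsHighlighter {
  // One formatter per field, created on first use and then reused,
  // mirroring the defaultFormatter handling in the base class.
  private final Map<String,PassageFormatter> formatters = new HashMap<String,PassageFormatter>();

  @Override
  protected PassageFormatter getFormatter(String field) {
    PassageFormatter formatter = formatters.get(field);
    if (formatter == null) {
      // Hypothetical policy: tag snippets with the field name.
      formatter = new PassageFormatter("<em class=\"" + field + "\">", "</em>", "... ");
      formatters.put(field, formatter);
    }
    return formatter;
  }
}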
/**
@@ -302,7 +323,13 @@ public class PostingsHighlighter {
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
// TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
Term terms[] = fieldTerms.toArray(new Term[fieldTerms.size()]);
// Strip off the redundant field:
BytesRef terms[] = new BytesRef[fieldTerms.size()];
int termUpto = 0;
for(Term term : fieldTerms) {
terms[termUpto++] = term.bytes();
}
Map<Integer,String> fieldHighlights = highlightField(field, contents[i], bi, terms, docids, leaves, maxPassages);
String[] result = new String[docids.length];
@@ -333,7 +360,7 @@ public class PostingsHighlighter {
return contents;
}
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,String> highlights = new HashMap<Integer,String>();
// reuse in the real sense... for docs in same segment we just advance our old enum
@@ -341,6 +368,11 @@
TermsEnum termsEnum = null;
int lastLeaf = -1;
PassageFormatter fieldFormatter = getFormatter(field);
if (fieldFormatter == null) {
throw new NullPointerException("PassageFormatter cannot be null");
}
for (int i = 0; i < docids.length; i++) {
String content = contents[i];
if (content.length() == 0) {
@@ -366,7 +398,7 @@
if (passages.length > 0) {
// otherwise a null snippet (eg if field is missing
// entirely from the doc)
highlights.put(doc, formatter.format(passages, content));
highlights.put(doc, fieldFormatter.format(passages, content));
}
lastLeaf = leaf;
}
@@ -377,8 +409,12 @@
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset)
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
private Passage[] highlightDoc(String field, Term terms[], int contentLength, BreakIterator bi, int doc,
private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
PassageScorer scorer = getScorer(field);
if (scorer == null) {
throw new NullPointerException("PassageScorer cannot be null");
}
PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
float weights[] = new float[terms.length];
// initialize postings
@@ -389,7 +425,7 @@
continue;
} else if (de == null) {
postings[i] = EMPTY; // initially
if (!termsEnum.seekExact(terms[i].bytes(), true)) {
if (!termsEnum.seekExact(terms[i], true)) {
continue; // term not found
}
de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
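
To make the scoring comment above concrete (score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))), a simplified standalone illustration of that formula; the decay and tf functions below are placeholders, not the actual PassageScorer implementation:

public class PassageScoreSketch {
  // score(passage) = norm(passageStart) * sum_i( weight_i * tf(freq_i) )
  static float scorePassage(int passageStart, float[] weights, int[] freqs) {
    float norm = 1.0f / (1.0f + passageStart);            // placeholder start-offset decay
    float sum = 0.0f;
    for (int i = 0; i < weights.length; i++) {
      sum += weights[i] * (float) Math.sqrt(freqs[i]);    // placeholder tf saturation
    }
    return norm * sum;
  }
}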


@@ -457,7 +457,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null);
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
@@ -527,7 +527,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter()) {
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null) {
@Override
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
assert fields.length == 1;
@@ -636,7 +636,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null);
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[] {0};
String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");


@@ -112,16 +112,26 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
for (int n = 1; n < maxTopN; n++) {
FakePassageFormatter f1 = new FakePassageFormatter();
final FakePassageFormatter f1 = new FakePassageFormatter();
PostingsHighlighter p1 = new PostingsHighlighter(Integer.MAX_VALUE-1,
BreakIterator.getSentenceInstance(Locale.ROOT),
new PassageScorer(),
f1);
FakePassageFormatter f2 = new FakePassageFormatter();
BreakIterator.getSentenceInstance(Locale.ROOT)) {
@Override
protected PassageFormatter getFormatter(String field) {
assertEquals("body", field);
return f1;
}
};
final FakePassageFormatter f2 = new FakePassageFormatter();
PostingsHighlighter p2 = new PostingsHighlighter(Integer.MAX_VALUE-1,
BreakIterator.getSentenceInstance(Locale.ROOT),
new PassageScorer(),
f2);
BreakIterator.getSentenceInstance(Locale.ROOT)) {
@Override
protected PassageFormatter getFormatter(String field) {
assertEquals("body", field);
return f2;
}
};
BooleanQuery bq = new BooleanQuery(false);
bq.add(query, BooleanClause.Occur.MUST);
bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
@@ -170,8 +180,7 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
// we use a very simple analyzer. so we can assert the matches are correct
int lastMatchStart = -1;
for (int i = 0; i < p.getNumMatches(); i++) {
Term term = p.getMatchTerms()[i];
assertEquals("body", term.field());
BytesRef term = p.getMatchTerms()[i];
int matchStart = p.getMatchStarts()[i];
assertTrue(matchStart >= 0);
// must at least start within the passage
@@ -184,9 +193,8 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
// single character terms
assertEquals(matchStart+1, matchEnd);
// and the offsets must be correct...
BytesRef bytes = term.bytes();
assertEquals(1, bytes.length);
assertEquals((char)bytes.bytes[bytes.offset], Character.toLowerCase(content.charAt(matchStart)));
assertEquals(1, term.length);
assertEquals((char)term.bytes[term.offset], Character.toLowerCase(content.charAt(matchStart)));
}
// record just the start/end offset for simplicity
seen.add(new Pair(p.getStartOffset(), p.getEndOffset()));
@@ -262,9 +270,12 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000,
BreakIterator.getSentenceInstance(Locale.ROOT),
new PassageScorer(1.2f, 0, 87),
new PassageFormatter());
BreakIterator.getSentenceInstance(Locale.ROOT)) {
@Override
protected PassageScorer getScorer(String field) {
return new PassageScorer(1.2f, 0, 87);
}
};
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
@@ -299,9 +310,12 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000,
BreakIterator.getSentenceInstance(Locale.ROOT),
new PassageScorer(0, 0.75f, 87),
new PassageFormatter());
BreakIterator.getSentenceInstance(Locale.ROOT)) {
@Override
protected PassageScorer getScorer(String field) {
return new PassageScorer(0, 0.75f, 87);
}
};
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);


@@ -97,7 +97,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
if (pivot == null) {
pivot = "87";
}
PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
final PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
// formatter parameters: preTag/postTag/ellipsis
String preTag = attributes.get("preTag");
@@ -112,7 +112,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
if (ellipsis == null) {
ellipsis = "... ";
}
PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
final PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
String summarizeEmpty = attributes.get("summarizeEmpty");
final boolean summarizeEmptyBoolean;
@@ -127,7 +127,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
if (attributes.containsKey("maxLength")) {
maxLength = Integer.parseInt(attributes.get("maxLength"));
}
highlighter = new PostingsHighlighter(maxLength, breakIterator, scorer, formatter) {
highlighter = new PostingsHighlighter(maxLength, breakIterator) {
@Override
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
if (summarizeEmptyBoolean) {
@@ -136,6 +136,16 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
return new Passage[0];
}
}
@Override
protected PassageFormatter getFormatter(String fieldName) {
return formatter;
}
@Override
protected PassageScorer getScorer(String fieldName) {
return scorer;
}
};
}