SOLR-452 commit: hl.mergeContiguous

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@610191 13f79535-47bb-0310-9956-ffa450edef68
2008-01-08 22:02:46 +00:00 · 2008-01-08 22:02:46 +00:00 · 06b3f0eed0
parent a405feac80
commit 06b3f0eed0
4 changed files with 84 additions and 18 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -87,9 +87,11 @@ New Features
 13. SOLR-225: Enable pluggable highlighting classes.  Allow configurable
    highlighting formatters and Fragmenters.  (ryan)

-14. SOLR-273/376: Added hl.maxAnalyzedChars highlighting parameter, defaulting to
-    50k.  Also add hl.alternateField, which allows the specification of a backup
-    field to use as summary if no keywords are matched. (klaas)
+14. SOLR-273/376/452: Added hl.maxAnalyzedChars highlighting parameter, defaulting 
+    to 50k, hl.alternateField, which allows the specification of a backup
+    field to use as summary if no keywords are matched, and hl.mergeContiguous,
+    which combines fragments if they are adjacent in the source document.
+    (klaas, Grant Ingersoll via klaas)

 15. SOLR-291: Control maximum number of documents to cache for any entry
    in the queryResultCache via queryResultMaxDocsCached solrconfig.xml 
--- a/src/java/org/apache/solr/common/params/HighlightParams.java
+++ b/src/java/org/apache/solr/common/params/HighlightParams.java
@ -33,6 +33,7 @@ public interface HighlightParams {
  public static final String FIELD_MATCH = HIGHLIGHT+".requireFieldMatch";
  public static final String ALTERNATE_FIELD = HIGHLIGHT+".alternateField";

+  public static final String MERGE_CONTIGUOUS_FRAGMENTS = HIGHLIGHT + ".mergeContiguous";
  // Formatter
  public static final String SIMPLE = "simple";
  public static final String SIMPLE_PRE  = HIGHLIGHT+"."+SIMPLE+".pre";
--- a/src/java/org/apache/solr/highlight/SolrHighlighter.java
+++ b/src/java/org/apache/solr/highlight/SolrHighlighter.java
@ -187,6 +187,15 @@ public class SolrHighlighter
  protected int getMaxSnippets(String fieldName, SolrParams params) {
     return params.getFieldInt(fieldName, HighlightParams.SNIPPETS,1);
  }
+
+  /**
+   * Return whether adjacent fragments should be merged.
+   * @param fieldName The name of the field
+   * @param params The params controlling Highlighting
+   */
+  protected boolean isMergeContiguousFragments(String fieldName, SolrParams params){
+    return params.getFieldBool(fieldName, HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, false);
+  }
  
  /**
   * Return a formatter appropriate for this field. If a formatter
@ -260,20 +269,22 @@ public class SolrHighlighter
       searcher.readDocs(readDocs, docs, fset);
     }

-     // Highlight each document
-     DocIterator iterator = docs.iterator();
-     for (int i = 0; i < docs.size(); i++) {
-        int docId = iterator.nextDoc();
-        Document doc = readDocs[i];
-        NamedList docSummaries = new SimpleOrderedMap();
-        for (String fieldName : fieldNames) {
-           fieldName = fieldName.trim();
-           String[] docTexts = doc.getValues(fieldName);
-           if (docTexts == null) continue;

-           // get highlighter, and number of fragments for this field
-           Highlighter highlighter = getHighlighter(query, fieldName, req);
-           int numFragments = getMaxSnippets(fieldName, params);
+    // Highlight each document
+    DocIterator iterator = docs.iterator();
+    for (int i = 0; i < docs.size(); i++) {
+       int docId = iterator.nextDoc();
+       Document doc = readDocs[i];
+       NamedList docSummaries = new SimpleOrderedMap();
+       for (String fieldName : fieldNames) {
+          fieldName = fieldName.trim();
+          String[] docTexts = doc.getValues(fieldName);
+          if (docTexts == null) continue;
+
+          // get highlighter, and number of fragments for this field
+          Highlighter highlighter = getHighlighter(query, fieldName, req);
+          int numFragments = getMaxSnippets(fieldName, params);
+          boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

           String[] summaries = null;
           TextFragment[] frag;
@ -288,7 +299,7 @@ public class SolrHighlighter
                 // fall back to analyzer
                 tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[0])), 10);
              }
-              frag = highlighter.getBestTextFragments(tstream, docTexts[0], false, numFragments);
+              frag = highlighter.getBestTextFragments(tstream, docTexts[0], mergeContiguousFragments, numFragments);
           }
           else {
              // multi-valued field
--- a/src/test/org/apache/solr/highlight/HighlighterTest.java
+++ b/src/test/org/apache/solr/highlight/HighlighterTest.java
@ -19,6 +19,8 @@ package org.apache.solr.highlight;

 import org.apache.solr.core.SolrCore;
 import org.apache.solr.util.*;
+import org.apache.solr.common.params.HighlightParams;
+
 import java.util.HashMap;

 /**
@ -27,7 +29,11 @@ import java.util.HashMap;
 */
 public class HighlighterTest extends AbstractSolrTestCase {

-  private static String LONG_TEXT = "a long days night this should be a piece of text which is is is is is is is is is is is is is is is is is is is is is is is is isis is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is sufficiently lengthly to produce multiple fragments which are not concatenated at all--we want two disjoint long fragments.";
+  private static String LONG_TEXT = "a long days night this should be a piece of text which is is is is is is is is is is is is is is is is is is is " +
+          "is is is is is isis is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is " +
+          "is is is is is is is is is is is is is " +
+          "is is is is is is is is is is is is is is is is is is is is sufficiently lengthly to produce multiple fragments which are not concatenated " +
+          "at all--we want two disjoint long fragments.";

  @Override public String getSchemaFile() { return "schema.xml"; }
  @Override public String getSolrConfigFile() { return "solrconfig.xml"; }
@ -67,6 +73,52 @@ public class HighlighterTest extends AbstractSolrTestCase {
    assertTrue( regex instanceof RegexFragmenter );
  }

+  public void testMergeContiguous() throws Exception {
+    HashMap<String,String> args = new HashMap<String,String>();
+    args.put(HighlightParams.HIGHLIGHT, "true");
+    args.put("df", "t_text");
+    args.put(HighlightParams.FIELDS, "");
+    args.put(HighlightParams.SNIPPETS, String.valueOf(4));
+    args.put(HighlightParams.FRAGSIZE, String.valueOf(40));
+    args.put(HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, "true");
+    TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
+      "standard", 0, 200, args);
+    String input = "this is some long text.  It has the word long in many places.  In fact, it has long on some different fragments.  " +
+            "Let us see what happens to long in this case.";
+    String gold = "this is some <em>long</em> text.  It has the word <em>long</em> in many places.  In fact, it has <em>long</em> on some different fragments.  " +
+            "Let us see what happens to <em>long</em> in this case.";
+    assertU(adoc("t_text", input, "id", "1"));
+    assertU(commit());
+    assertU(optimize());
+    assertQ("Merge Contiguous",
+            sumLRF.makeRequest("t_text:long"),
+            "//lst[@name='highlighting']/lst[@name='1']",
+            "//lst[@name='1']/arr[@name='t_text']/str[.='" + gold + "']"
+            );
+    args.put("f.t_text." + HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, "true");
+    assertU(adoc("t_text", input, "id", "1"));
+    assertU(commit());
+    assertU(optimize());
+    assertQ("Merge Contiguous",
+            sumLRF.makeRequest("t_text:long"),
+            "//lst[@name='highlighting']/lst[@name='1']",
+            "//lst[@name='1']/arr[@name='t_text']/str[.='" + gold + "']"
+            );
+
+    args.put(HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, "false");
+    args.put("f.t_text." + HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, "false");
+    sumLRF = h.getRequestFactory(
+      "standard", 0, 200, args);
+    assertQ("Merge Contiguous",
+            sumLRF.makeRequest("t_text:long"),
+            "//lst[@name='highlighting']/lst[@name='1']",
+            "//lst[@name='1']/arr[@name='t_text']/str[.='this is some <em>long</em> text.  It has']",
+            "//lst[@name='1']/arr[@name='t_text']/str[.=' the word <em>long</em> in many places.  In fact, it has']",
+            "//lst[@name='1']/arr[@name='t_text']/str[.=' <em>long</em> on some different fragments.  Let us']",
+            "//lst[@name='1']/arr[@name='t_text']/str[.=' see what happens to <em>long</em> in this case.']"
+            );
+  }
+
  public void testTermVecHighlight() {

    // do summarization using term vectors