LUCENE-3665: Make WeightedSpanTermExtractor extensible to handle custom query implemenations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1226417 13f79535-47bb-0310-9956-ffa450edef68
2012-01-02 13:49:23 +00:00 · 2012-01-02 13:49:23 +00:00 · 33a3c50b02
parent d5932e1149
commit 33a3c50b02
5 changed files with 239 additions and 20 deletions
--- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/PositionSpan.java
+++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/PositionSpan.java
@ -0,0 +1,31 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Utility class to record Positions Spans
+ * @lucene.internal
+ */
+public class PositionSpan {
+  int start;
+  int end;
+
+  public PositionSpan(int start, int end) {
+    this.start = start;
+    this.end = end;
+  }
+}
--- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
+++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
@ -207,8 +207,7 @@ public class QueryScorer implements Scorer {
  }
  
  private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
-    WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
-        : new WeightedSpanTermExtractor(defaultField);
+    WeightedSpanTermExtractor qse = newTermExtractor(defaultField);
    qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
    qse.setExpandMultiTermQuery(expandMultiTermQuery);
    qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
@ -225,6 +224,11 @@ public class QueryScorer implements Scorer {
    
    return null;
  }
+  
+  protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
+    return defaultField == null ? new WeightedSpanTermExtractor()
+    : new WeightedSpanTermExtractor(defaultField);
+  }

  /*
   * (non-Javadoc)
--- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java
+++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java
@ -89,16 +89,7 @@ public class WeightedSpanTerm extends WeightedTerm{
  public List<PositionSpan> getPositionSpans() {
    return positionSpans;
  }
+
 }


-// Utility class to store a Span
-class PositionSpan {
-  int start;
-  int end;
-
-  public PositionSpan(int start, int end) {
-    this.start = start;
-    this.end = end;
-  }
-}
--- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@ -90,7 +90,7 @@ public class WeightedSpanTermExtractor {
   *          Map to place created WeightedSpanTerms in
   * @throws IOException
   */
-  private void extract(Query query, Map<String,WeightedSpanTerm> terms) throws IOException {
+  protected void extract(Query query, Map<String,WeightedSpanTerm> terms) throws IOException {
    if (query instanceof BooleanQuery) {
      BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();

@ -206,6 +206,12 @@ public class WeightedSpanTermExtractor {
        extractWeightedSpanTerms(terms, sp);
      }
    }
+    extractUnknownQuery(query, terms);
+  }
+
+  protected void extractUnknownQuery(Query query,
+      Map<String, WeightedSpanTerm> terms) throws IOException {
+    // for sub-classing to extract custom queries
  }

  /**
@ -217,7 +223,7 @@ public class WeightedSpanTermExtractor {
   *          SpanQuery to extract Terms from
   * @throws IOException
   */
-  private void extractWeightedSpanTerms(Map<String,WeightedSpanTerm> terms, SpanQuery spanQuery) throws IOException {
+  protected void extractWeightedSpanTerms(Map<String,WeightedSpanTerm> terms, SpanQuery spanQuery) throws IOException {
    Set<String> fieldNames;

    if (fieldName == null) {
@ -305,7 +311,7 @@ public class WeightedSpanTermExtractor {
   *          Query to extract Terms from
   * @throws IOException
   */
-  private void extractWeightedTerms(Map<String,WeightedSpanTerm> terms, Query query) throws IOException {
+  protected void extractWeightedTerms(Map<String,WeightedSpanTerm> terms, Query query) throws IOException {
    Set<Term> nonWeightedTerms = new HashSet<Term>();
    query.extractTerms(nonWeightedTerms);

@ -321,13 +327,13 @@ public class WeightedSpanTermExtractor {
  /**
   * Necessary to implement matches for queries against <code>defaultField</code>
   */
-  private boolean fieldNameComparator(String fieldNameToCheck) {
+  protected boolean fieldNameComparator(String fieldNameToCheck) {
    boolean rv = fieldName == null || fieldName.equals(fieldNameToCheck)
      || (defaultField != null && defaultField.equals(fieldNameToCheck));
    return rv;
  }

-  private AtomicReaderContext getLeafContextForField(String field) throws IOException {
+  protected AtomicReaderContext getLeafContextForField(String field) throws IOException {
    if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
      tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
      cachedTokenStream = true;
@ -449,7 +455,7 @@ public class WeightedSpanTermExtractor {
    return terms;
  }
  
-  private void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
+  protected void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
    if (spanQuery instanceof FieldMaskingSpanQuery) {
      collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
    } else if (spanQuery instanceof SpanFirstQuery) {
@ -469,7 +475,7 @@ public class WeightedSpanTermExtractor {
    }
  }
  
-  private boolean mustRewriteQuery(SpanQuery spanQuery) {
+  protected boolean mustRewriteQuery(SpanQuery spanQuery) {
    if (!expandMultiTermQuery) {
      return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
    } else if (spanQuery instanceof FieldMaskingSpanQuery) {
@ -504,7 +510,8 @@ public class WeightedSpanTermExtractor {
   * This class makes sure that if both position sensitive and insensitive
   * versions of the same term are added, the position insensitive one wins.
   */
-  static private class PositionCheckingMap<K> extends HashMap<K,WeightedSpanTerm> {
+  @SuppressWarnings("serial")
+  protected static class PositionCheckingMap<K> extends HashMap<K,WeightedSpanTerm> {

    @Override
    public void putAll(Map<? extends K,? extends WeightedSpanTerm> m) {
--- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java
+++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java
@ -0,0 +1,186 @@
+package org.apache.lucene.search.highlight.custom;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Map;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.highlight.Highlighter;
+import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
+import org.apache.lucene.search.highlight.QueryScorer;
+import org.apache.lucene.search.highlight.SimpleFragmenter;
+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.search.highlight.WeightedSpanTerm;
+import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests the extensibility of {@link WeightedSpanTermExtractor} and
+ * {@link QueryScorer} in a user defined package
+ */
+public class HighlightCustomQueryTest extends LuceneTestCase {
+
+  private static final String FIELD_NAME = "contents";
+
+  public void testHighlightCustomQuery() throws IOException,
+      InvalidTokenOffsetsException {
+    String s1 = "I call our world Flatland, not because we call it so,";
+
+    // Verify that a query against the default field results in text being
+    // highlighted
+    // regardless of the field name.
+
+    CustomQuery q = new CustomQuery(new Term(FIELD_NAME, "world"));
+
+    String expected = "I call our <B>world</B> Flatland, not because we call it so,";
+    String observed = highlightField(q, "SOME_FIELD_NAME", s1);
+    if (VERBOSE)
+      System.out.println("Expected: \"" + expected + "\n" + "Observed: \""
+          + observed);
+    assertEquals(
+        "Query in the default field results in text for *ANY* field being highlighted",
+        expected, observed);
+
+    // Verify that a query against a named field does not result in any
+    // highlighting
+    // when the query field name differs from the name of the field being
+    // highlighted,
+    // which in this example happens to be the default field name.
+    q = new CustomQuery(new Term("text", "world"));
+
+    expected = s1;
+    observed = highlightField(q, FIELD_NAME, s1);
+    if (VERBOSE)
+      System.out.println("Expected: \"" + expected + "\n" + "Observed: \""
+          + observed);
+    assertEquals(
+        "Query in a named field does not result in highlighting when that field isn't in the query",
+        s1, highlightField(q, FIELD_NAME, s1));
+
+  }
+
+  /**
+   * This method intended for use with
+   * <tt>testHighlightingWithDefaultField()</tt>
+   * 
+   * @throws InvalidTokenOffsetsException
+   */
+  private static String highlightField(Query query, String fieldName,
+      String text) throws IOException, InvalidTokenOffsetsException {
+    TokenStream tokenStream = new MockAnalyzer(random, MockTokenizer.SIMPLE,
+        true, MockTokenFilter.ENGLISH_STOPSET, true).tokenStream(fieldName,
+        new StringReader(text));
+    // Assuming "<B>", "</B>" used to highlight
+    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
+    MyQueryScorer scorer = new MyQueryScorer(query, fieldName, FIELD_NAME);
+    Highlighter highlighter = new Highlighter(formatter, scorer);
+    highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
+
+    String rv = highlighter.getBestFragments(tokenStream, text, 1,
+        "(FIELD TEXT TRUNCATED)");
+    return rv.length() == 0 ? text : rv;
+  }
+
+  public static class MyWeightedSpanTermExtractor extends
+      WeightedSpanTermExtractor {
+
+    public MyWeightedSpanTermExtractor() {
+      super();
+    }
+
+    public MyWeightedSpanTermExtractor(String defaultField) {
+      super(defaultField);
+    }
+
+    @Override
+    protected void extractUnknownQuery(Query query,
+        Map<String, WeightedSpanTerm> terms) throws IOException {
+      if (query instanceof CustomQuery) {
+        extractWeightedTerms(terms, new TermQuery(((CustomQuery) query).term));
+      }
+    }
+
+  }
+
+  public static class MyQueryScorer extends QueryScorer {
+
+    public MyQueryScorer(Query query, String field, String defaultField) {
+      super(query, field, defaultField);
+    }
+
+    @Override
+    protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
+      return defaultField == null ? new MyWeightedSpanTermExtractor()
+          : new MyWeightedSpanTermExtractor(defaultField);
+    }
+
+  }
+
+  public static class CustomQuery extends Query {
+    private final Term term;
+
+    public CustomQuery(Term term) {
+      super();
+      this.term = term;
+    }
+
+    @Override
+    public String toString(String field) {
+      return new TermQuery(term).toString(field);
+    }
+
+    @Override
+    public Query rewrite(IndexReader reader) throws IOException {
+      return new TermQuery(term);
+    }
+
+    @Override
+    public int hashCode() {
+      final int prime = 31;
+      int result = super.hashCode();
+      result = prime * result + ((term == null) ? 0 : term.hashCode());
+      return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj)
+        return true;
+      if (!super.equals(obj))
+        return false;
+      if (getClass() != obj.getClass())
+        return false;
+      CustomQuery other = (CustomQuery) obj;
+      if (term == null) {
+        if (other.term != null)
+          return false;
+      } else if (!term.equals(other.term))
+        return false;
+      return true;
+    }
+
+  }
+}