LUCENE-2013: SpanRegexQuery does not work with QueryScorer.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@831024 13f79535-47bb-0310-9956-ffa450edef68
2009-10-29 16:41:04 +00:00 · 2009-10-29 16:41:04 +00:00 · 88d3d41992
parent 1b38f9c24d
commit 88d3d41992
4 changed files with 116 additions and 9 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -161,6 +161,9 @@ Bug fixes
 * LUCENE-2004: Fix Constants.LUCENE_MAIN_VERSION to not be inlined
  by client code.  (Uwe Schindler)
  
+* LUCENE-2013: SpanRegexQuery does not work with QueryScorer.
+  (Benjamin Keil via Mark Miller)
+
 New features

 * LUCENE-1933: Provide a convenience AttributeFactory that creates a
--- a/contrib/highlighter/build.xml
+++ b/contrib/highlighter/build.xml
@ -28,17 +28,25 @@
  <property name="memory.jar" location="${common.dir}/build/contrib/memory/lucene-memory-${version}.jar"/>
  <available property="memory.jar.present" type="file" file="${memory.jar}"/>
  
+  <property name="regex.jar" location="${common.dir}/build/contrib/regex/lucene-regex-${version}.jar"/>
+  <available property="regex.jar.present" type="file" file="${regex.jar}"/>
+
  <path id="classpath">
    <pathelement path="${lucene.jar}"/>
    <pathelement path="${memory.jar}"/>
+    <pathelement path="${regex.jar}"/>
    <pathelement path="${project.classpath}"/>
  </path>

-  <target name="compile-core" depends="build-memory, common.compile-core" />
+  <target name="compile-core" depends="build-memory, build-regex, common.compile-core" />

  <target name="build-memory" unless="memory.jar.present">
    <echo>Highlighter building dependency ${memory.jar}</echo>
    <ant antfile="../memory/build.xml" target="default" inheritall="false" dir="../memory" />
  </target>
  
+  <target name="build-regex" unless="regex.jar.present">
+    <echo>Highlighter building dependency ${regex.jar}</echo>
+    <ant antfile="../regex/build.xml" target="default" inheritall="false" dir="../regex" />
+  </target>
 </project>
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@ -32,7 +32,10 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.search.*;
+import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
+import org.apache.lucene.search.spans.SpanFirstQuery;
 import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
@ -220,16 +223,11 @@ public class WeightedSpanTermExtractor {
   * @throws IOException
   */
  private void extractWeightedSpanTerms(Map<String,WeightedSpanTerm> terms, SpanQuery spanQuery) throws IOException {
-    Set<Term> nonWeightedTerms = new HashSet<Term>();
-    spanQuery.extractTerms(nonWeightedTerms);
-
    Set<String> fieldNames;

    if (fieldName == null) {
      fieldNames = new HashSet<String>();
-      for (final Term queryTerm : nonWeightedTerms) {
-        fieldNames.add(queryTerm.field());
-      }
+      collectSpanQueryFields(spanQuery, fieldNames);
    } else {
      fieldNames = new HashSet<String>(1);
      fieldNames.add(fieldName);
@ -239,12 +237,32 @@ public class WeightedSpanTermExtractor {
      fieldNames.add(defaultField);
    }
    
+    Map<String, SpanQuery> queries = new HashMap<String, SpanQuery>();
+ 
+    Set<Term> nonWeightedTerms = new HashSet<Term>();
+    final boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
+    if (mustRewriteQuery) {
+      for (final String field : fieldNames) {
+        final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getReaderForField(field));
+        queries.put(field, rewrittenQuery);
+        rewrittenQuery.extractTerms(nonWeightedTerms);
+      }
+    } else {
+      spanQuery.extractTerms(nonWeightedTerms);
+    }
+
    List<PositionSpan> spanPositions = new ArrayList<PositionSpan>();

    for (final String field : fieldNames) {

      IndexReader reader = getReaderForField(field);
-      Spans spans = spanQuery.getSpans(reader);
+      final Spans spans;
+      if (mustRewriteQuery) {
+        spans = queries.get(field).getSpans(reader);
+      } else {
+        spans = spanQuery.getSpans(reader);
+      }
+

      // collect span positions
      while (spans.next()) {
@ -429,6 +447,57 @@ public class WeightedSpanTermExtractor {
    return terms;
  }
  
+  private void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
+    if (spanQuery instanceof FieldMaskingSpanQuery) {
+      collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
+    } else if (spanQuery instanceof SpanFirstQuery) {
+      collectSpanQueryFields(((SpanFirstQuery)spanQuery).getMatch(), fieldNames);
+    } else if (spanQuery instanceof SpanNearQuery) {
+      for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) {
+        collectSpanQueryFields(clause, fieldNames);
+      }
+    } else if (spanQuery instanceof SpanNotQuery) {
+      collectSpanQueryFields(((SpanNotQuery)spanQuery).getInclude(), fieldNames);
+    } else if (spanQuery instanceof SpanOrQuery) {
+      for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) {
+        collectSpanQueryFields(clause, fieldNames);
+      }
+    } else {
+      fieldNames.add(spanQuery.getField());
+    }
+  }
+  
+  private boolean mustRewriteQuery(SpanQuery spanQuery) {
+    if (!expandMultiTermQuery) {
+      return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
+    } else if (spanQuery instanceof FieldMaskingSpanQuery) {
+      return mustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery());
+    } else if (spanQuery instanceof SpanFirstQuery) {
+      return mustRewriteQuery(((SpanFirstQuery)spanQuery).getMatch());
+    } else if (spanQuery instanceof SpanNearQuery) {
+      for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) {
+        if (mustRewriteQuery(clause)) {
+          return true;
+        }
+      }
+      return false; 
+    } else if (spanQuery instanceof SpanNotQuery) {
+      SpanNotQuery spanNotQuery = (SpanNotQuery)spanQuery;
+      return mustRewriteQuery(spanNotQuery.getInclude()) || mustRewriteQuery(spanNotQuery.getExclude());
+    } else if (spanQuery instanceof SpanOrQuery) {
+      for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) {
+        if (mustRewriteQuery(clause)) {
+          return true;
+        }
+      }
+      return false; 
+    } else if (spanQuery instanceof SpanTermQuery) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+  
  /**
   * This class makes sure that if both position sensitive and insensitive
   * versions of the same term are added, the position insensitive one wins.
--- a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@ -70,8 +70,10 @@ import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
+import org.apache.lucene.search.regex.SpanRegexQuery;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
@ -306,6 +308,31 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
    
  }
  
+  public void testSpanRegexQuery() throws Exception {
+    query = new SpanOrQuery(new SpanQuery [] {
+        new SpanRegexQuery(new Term(FIELD_NAME, "ken.*")) });
+    searcher = new IndexSearcher(ramDir, true);
+    hits = searcher.search(query, 100);
+    int maxNumFragmentsRequired = 2;
+
+    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
+    Highlighter highlighter = new Highlighter(this, scorer);
+    
+    for (int i = 0; i < hits.totalHits; i++) {
+      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+
+      highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+          "...");
+      System.out.println("\t" + result);
+    }
+    
+    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+        numHighlights == 5);
+  }
+  
  public void testNumericRangeQuery() throws Exception {
    // doesn't currently highlight, but make sure it doesn't cause exception either
    query = NumericRangeQuery.newIntRange(NUMERIC_FIELD_NAME, 2, 6, true, true);