From 88d3d419923d81fc06397db4afba79080fa4b026 Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Thu, 29 Oct 2009 16:41:04 +0000 Subject: [PATCH] LUCENE-2013: SpanRegexQuery does not work with QueryScorer. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@831024 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + contrib/highlighter/build.xml | 12 ++- .../highlight/WeightedSpanTermExtractor.java | 83 +++++++++++++++++-- .../search/highlight/HighlighterTest.java | 27 ++++++ 4 files changed, 116 insertions(+), 9 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 684682e393c..85cda451c91 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -160,6 +160,9 @@ Bug fixes * LUCENE-2004: Fix Constants.LUCENE_MAIN_VERSION to not be inlined by client code. (Uwe Schindler) + +* LUCENE-2013: SpanRegexQuery does not work with QueryScorer. + (Benjamin Keil via Mark Miller) New features diff --git a/contrib/highlighter/build.xml b/contrib/highlighter/build.xml index 2acee3f9fe8..bcaee11c09e 100644 --- a/contrib/highlighter/build.xml +++ b/contrib/highlighter/build.xml @@ -27,18 +27,26 @@ + + + + - + Highlighter building dependency ${memory.jar} - + + + Highlighter building dependency ${regex.jar} + + diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index ae5253ab054..b479e82a47b 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -32,7 +32,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.search.*; +import org.apache.lucene.search.spans.FieldMaskingSpanQuery; +import org.apache.lucene.search.spans.SpanFirstQuery; import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; @@ -220,16 +223,11 @@ public class WeightedSpanTermExtractor { * @throws IOException */ private void extractWeightedSpanTerms(Map terms, SpanQuery spanQuery) throws IOException { - Set nonWeightedTerms = new HashSet(); - spanQuery.extractTerms(nonWeightedTerms); - Set fieldNames; if (fieldName == null) { fieldNames = new HashSet(); - for (final Term queryTerm : nonWeightedTerms) { - fieldNames.add(queryTerm.field()); - } + collectSpanQueryFields(spanQuery, fieldNames); } else { fieldNames = new HashSet(1); fieldNames.add(fieldName); @@ -238,13 +236,33 @@ public class WeightedSpanTermExtractor { if (defaultField != null) { fieldNames.add(defaultField); } + + Map queries = new HashMap(); + + Set nonWeightedTerms = new HashSet(); + final boolean mustRewriteQuery = mustRewriteQuery(spanQuery); + if (mustRewriteQuery) { + for (final String field : fieldNames) { + final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getReaderForField(field)); + queries.put(field, rewrittenQuery); + rewrittenQuery.extractTerms(nonWeightedTerms); + } + } else { + spanQuery.extractTerms(nonWeightedTerms); + } List spanPositions = new ArrayList(); for (final String field : fieldNames) { IndexReader reader = getReaderForField(field); - Spans spans = spanQuery.getSpans(reader); + final Spans spans; + if (mustRewriteQuery) { + spans = queries.get(field).getSpans(reader); + } else { + spans = spanQuery.getSpans(reader); + } + // collect span positions while (spans.next()) { @@ -429,6 +447,57 @@ public class WeightedSpanTermExtractor { return terms; } + private void collectSpanQueryFields(SpanQuery spanQuery, Set fieldNames) { + if (spanQuery instanceof FieldMaskingSpanQuery) { + collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames); + } else if (spanQuery instanceof SpanFirstQuery) { + collectSpanQueryFields(((SpanFirstQuery)spanQuery).getMatch(), fieldNames); + } else if (spanQuery instanceof SpanNearQuery) { + for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) { + collectSpanQueryFields(clause, fieldNames); + } + } else if (spanQuery instanceof SpanNotQuery) { + collectSpanQueryFields(((SpanNotQuery)spanQuery).getInclude(), fieldNames); + } else if (spanQuery instanceof SpanOrQuery) { + for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) { + collectSpanQueryFields(clause, fieldNames); + } + } else { + fieldNames.add(spanQuery.getField()); + } + } + + private boolean mustRewriteQuery(SpanQuery spanQuery) { + if (!expandMultiTermQuery) { + return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery. + } else if (spanQuery instanceof FieldMaskingSpanQuery) { + return mustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery()); + } else if (spanQuery instanceof SpanFirstQuery) { + return mustRewriteQuery(((SpanFirstQuery)spanQuery).getMatch()); + } else if (spanQuery instanceof SpanNearQuery) { + for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) { + if (mustRewriteQuery(clause)) { + return true; + } + } + return false; + } else if (spanQuery instanceof SpanNotQuery) { + SpanNotQuery spanNotQuery = (SpanNotQuery)spanQuery; + return mustRewriteQuery(spanNotQuery.getInclude()) || mustRewriteQuery(spanNotQuery.getExclude()); + } else if (spanQuery instanceof SpanOrQuery) { + for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) { + if (mustRewriteQuery(clause)) { + return true; + } + } + return false; + } else if (spanQuery instanceof SpanTermQuery) { + return false; + } else { + return true; + } + } + /** * This class makes sure that if both position sensitive and insensitive * versions of the same term are added, the position insensitive one wins. diff --git a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index 64c0bd508ad..a98d0b1e82e 100644 --- a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -70,8 +70,10 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner; +import org.apache.lucene.search.regex.SpanRegexQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNotQuery; +import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; @@ -306,6 +308,31 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } + public void testSpanRegexQuery() throws Exception { + query = new SpanOrQuery(new SpanQuery [] { + new SpanRegexQuery(new Term(FIELD_NAME, "ken.*")) }); + searcher = new IndexSearcher(ramDir, true); + hits = searcher.search(query, 100); + int maxNumFragmentsRequired = 2; + + QueryScorer scorer = new QueryScorer(query, FIELD_NAME); + Highlighter highlighter = new Highlighter(this, scorer); + + for (int i = 0; i < hits.totalHits; i++) { + String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + + String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, + "..."); + System.out.println("\t" + result); + } + + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 5); + } + public void testNumericRangeQuery() throws Exception { // doesn't currently highlight, but make sure it doesn't cause exception either query = NumericRangeQuery.newIntRange(NUMERIC_FIELD_NAME, 2, 6, true, true);