mirror of https://github.com/apache/lucene.git
LUCENE-3665: Make WeightedSpanTermExtractor extensible to handle custom query implemenations
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1226417 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d5932e1149
commit
33a3c50b02
|
@ -0,0 +1,31 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Utility class to record Positions Spans
|
||||
* @lucene.internal
|
||||
*/
|
||||
public class PositionSpan {
|
||||
int start;
|
||||
int end;
|
||||
|
||||
public PositionSpan(int start, int end) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
}
|
|
@ -207,8 +207,7 @@ public class QueryScorer implements Scorer {
|
|||
}
|
||||
|
||||
private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
|
||||
WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
|
||||
: new WeightedSpanTermExtractor(defaultField);
|
||||
WeightedSpanTermExtractor qse = newTermExtractor(defaultField);
|
||||
qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
|
||||
qse.setExpandMultiTermQuery(expandMultiTermQuery);
|
||||
qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
|
||||
|
@ -225,6 +224,11 @@ public class QueryScorer implements Scorer {
|
|||
|
||||
return null;
|
||||
}
|
||||
|
||||
protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
|
||||
return defaultField == null ? new WeightedSpanTermExtractor()
|
||||
: new WeightedSpanTermExtractor(defaultField);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
|
|
|
@ -89,16 +89,7 @@ public class WeightedSpanTerm extends WeightedTerm{
|
|||
public List<PositionSpan> getPositionSpans() {
|
||||
return positionSpans;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Utility class to store a Span
|
||||
class PositionSpan {
|
||||
int start;
|
||||
int end;
|
||||
|
||||
public PositionSpan(int start, int end) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -90,7 +90,7 @@ public class WeightedSpanTermExtractor {
|
|||
* Map to place created WeightedSpanTerms in
|
||||
* @throws IOException
|
||||
*/
|
||||
private void extract(Query query, Map<String,WeightedSpanTerm> terms) throws IOException {
|
||||
protected void extract(Query query, Map<String,WeightedSpanTerm> terms) throws IOException {
|
||||
if (query instanceof BooleanQuery) {
|
||||
BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
|
||||
|
||||
|
@ -206,6 +206,12 @@ public class WeightedSpanTermExtractor {
|
|||
extractWeightedSpanTerms(terms, sp);
|
||||
}
|
||||
}
|
||||
extractUnknownQuery(query, terms);
|
||||
}
|
||||
|
||||
protected void extractUnknownQuery(Query query,
|
||||
Map<String, WeightedSpanTerm> terms) throws IOException {
|
||||
// for sub-classing to extract custom queries
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -217,7 +223,7 @@ public class WeightedSpanTermExtractor {
|
|||
* SpanQuery to extract Terms from
|
||||
* @throws IOException
|
||||
*/
|
||||
private void extractWeightedSpanTerms(Map<String,WeightedSpanTerm> terms, SpanQuery spanQuery) throws IOException {
|
||||
protected void extractWeightedSpanTerms(Map<String,WeightedSpanTerm> terms, SpanQuery spanQuery) throws IOException {
|
||||
Set<String> fieldNames;
|
||||
|
||||
if (fieldName == null) {
|
||||
|
@ -305,7 +311,7 @@ public class WeightedSpanTermExtractor {
|
|||
* Query to extract Terms from
|
||||
* @throws IOException
|
||||
*/
|
||||
private void extractWeightedTerms(Map<String,WeightedSpanTerm> terms, Query query) throws IOException {
|
||||
protected void extractWeightedTerms(Map<String,WeightedSpanTerm> terms, Query query) throws IOException {
|
||||
Set<Term> nonWeightedTerms = new HashSet<Term>();
|
||||
query.extractTerms(nonWeightedTerms);
|
||||
|
||||
|
@ -321,13 +327,13 @@ public class WeightedSpanTermExtractor {
|
|||
/**
|
||||
* Necessary to implement matches for queries against <code>defaultField</code>
|
||||
*/
|
||||
private boolean fieldNameComparator(String fieldNameToCheck) {
|
||||
protected boolean fieldNameComparator(String fieldNameToCheck) {
|
||||
boolean rv = fieldName == null || fieldName.equals(fieldNameToCheck)
|
||||
|| (defaultField != null && defaultField.equals(fieldNameToCheck));
|
||||
return rv;
|
||||
}
|
||||
|
||||
private AtomicReaderContext getLeafContextForField(String field) throws IOException {
|
||||
protected AtomicReaderContext getLeafContextForField(String field) throws IOException {
|
||||
if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
|
||||
tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
|
||||
cachedTokenStream = true;
|
||||
|
@ -449,7 +455,7 @@ public class WeightedSpanTermExtractor {
|
|||
return terms;
|
||||
}
|
||||
|
||||
private void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
|
||||
protected void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
|
||||
if (spanQuery instanceof FieldMaskingSpanQuery) {
|
||||
collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
|
||||
} else if (spanQuery instanceof SpanFirstQuery) {
|
||||
|
@ -469,7 +475,7 @@ public class WeightedSpanTermExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
private boolean mustRewriteQuery(SpanQuery spanQuery) {
|
||||
protected boolean mustRewriteQuery(SpanQuery spanQuery) {
|
||||
if (!expandMultiTermQuery) {
|
||||
return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
|
||||
} else if (spanQuery instanceof FieldMaskingSpanQuery) {
|
||||
|
@ -504,7 +510,8 @@ public class WeightedSpanTermExtractor {
|
|||
* This class makes sure that if both position sensitive and insensitive
|
||||
* versions of the same term are added, the position insensitive one wins.
|
||||
*/
|
||||
static private class PositionCheckingMap<K> extends HashMap<K,WeightedSpanTerm> {
|
||||
@SuppressWarnings("serial")
|
||||
protected static class PositionCheckingMap<K> extends HashMap<K,WeightedSpanTerm> {
|
||||
|
||||
@Override
|
||||
public void putAll(Map<? extends K,? extends WeightedSpanTerm> m) {
|
||||
|
|
|
@ -0,0 +1,186 @@
|
|||
package org.apache.lucene.search.highlight.custom;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.highlight.Highlighter;
|
||||
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
|
||||
import org.apache.lucene.search.highlight.QueryScorer;
|
||||
import org.apache.lucene.search.highlight.SimpleFragmenter;
|
||||
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
||||
import org.apache.lucene.search.highlight.WeightedSpanTerm;
|
||||
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
* Tests the extensibility of {@link WeightedSpanTermExtractor} and
|
||||
* {@link QueryScorer} in a user defined package
|
||||
*/
|
||||
public class HighlightCustomQueryTest extends LuceneTestCase {
|
||||
|
||||
private static final String FIELD_NAME = "contents";
|
||||
|
||||
public void testHighlightCustomQuery() throws IOException,
|
||||
InvalidTokenOffsetsException {
|
||||
String s1 = "I call our world Flatland, not because we call it so,";
|
||||
|
||||
// Verify that a query against the default field results in text being
|
||||
// highlighted
|
||||
// regardless of the field name.
|
||||
|
||||
CustomQuery q = new CustomQuery(new Term(FIELD_NAME, "world"));
|
||||
|
||||
String expected = "I call our <B>world</B> Flatland, not because we call it so,";
|
||||
String observed = highlightField(q, "SOME_FIELD_NAME", s1);
|
||||
if (VERBOSE)
|
||||
System.out.println("Expected: \"" + expected + "\n" + "Observed: \""
|
||||
+ observed);
|
||||
assertEquals(
|
||||
"Query in the default field results in text for *ANY* field being highlighted",
|
||||
expected, observed);
|
||||
|
||||
// Verify that a query against a named field does not result in any
|
||||
// highlighting
|
||||
// when the query field name differs from the name of the field being
|
||||
// highlighted,
|
||||
// which in this example happens to be the default field name.
|
||||
q = new CustomQuery(new Term("text", "world"));
|
||||
|
||||
expected = s1;
|
||||
observed = highlightField(q, FIELD_NAME, s1);
|
||||
if (VERBOSE)
|
||||
System.out.println("Expected: \"" + expected + "\n" + "Observed: \""
|
||||
+ observed);
|
||||
assertEquals(
|
||||
"Query in a named field does not result in highlighting when that field isn't in the query",
|
||||
s1, highlightField(q, FIELD_NAME, s1));
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* This method intended for use with
|
||||
* <tt>testHighlightingWithDefaultField()</tt>
|
||||
*
|
||||
* @throws InvalidTokenOffsetsException
|
||||
*/
|
||||
private static String highlightField(Query query, String fieldName,
|
||||
String text) throws IOException, InvalidTokenOffsetsException {
|
||||
TokenStream tokenStream = new MockAnalyzer(random, MockTokenizer.SIMPLE,
|
||||
true, MockTokenFilter.ENGLISH_STOPSET, true).tokenStream(fieldName,
|
||||
new StringReader(text));
|
||||
// Assuming "<B>", "</B>" used to highlight
|
||||
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
|
||||
MyQueryScorer scorer = new MyQueryScorer(query, fieldName, FIELD_NAME);
|
||||
Highlighter highlighter = new Highlighter(formatter, scorer);
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
|
||||
|
||||
String rv = highlighter.getBestFragments(tokenStream, text, 1,
|
||||
"(FIELD TEXT TRUNCATED)");
|
||||
return rv.length() == 0 ? text : rv;
|
||||
}
|
||||
|
||||
public static class MyWeightedSpanTermExtractor extends
|
||||
WeightedSpanTermExtractor {
|
||||
|
||||
public MyWeightedSpanTermExtractor() {
|
||||
super();
|
||||
}
|
||||
|
||||
public MyWeightedSpanTermExtractor(String defaultField) {
|
||||
super(defaultField);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void extractUnknownQuery(Query query,
|
||||
Map<String, WeightedSpanTerm> terms) throws IOException {
|
||||
if (query instanceof CustomQuery) {
|
||||
extractWeightedTerms(terms, new TermQuery(((CustomQuery) query).term));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class MyQueryScorer extends QueryScorer {
|
||||
|
||||
public MyQueryScorer(Query query, String field, String defaultField) {
|
||||
super(query, field, defaultField);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
|
||||
return defaultField == null ? new MyWeightedSpanTermExtractor()
|
||||
: new MyWeightedSpanTermExtractor(defaultField);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class CustomQuery extends Query {
|
||||
private final Term term;
|
||||
|
||||
public CustomQuery(Term term) {
|
||||
super();
|
||||
this.term = term;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return new TermQuery(term).toString(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
return new TermQuery(term);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = super.hashCode();
|
||||
result = prime * result + ((term == null) ? 0 : term.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (!super.equals(obj))
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
CustomQuery other = (CustomQuery) obj;
|
||||
if (term == null) {
|
||||
if (other.term != null)
|
||||
return false;
|
||||
} else if (!term.equals(other.term))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue