mirror of https://github.com/apache/lucene.git
LUCENE-1285: WeightedSpanTermExtractor incorrectly treats the same terms occurring in different query types
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@659965 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 33aea48b02
commit f32b5a5698
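In short: when the same term appears in a query both position-sensitively (inside a phrase or span query) and position-insensitively (as a plain term query), the extractor previously let the later extraction overwrite the earlier entry's position sensitivity, so standalone occurrences of the term could go unhighlighted. The fix stores extracted terms in a new PositionCheckingMap (added below), whose put() makes the position-insensitive version win. A minimal sketch of the affected usage, mirroring the new testPosTermStdTerm test at the end of this diff; the analyzer, FIELD_NAME, formatter, and document text here are assumed test scaffolding, not part of this commit:

    // Query mixing a plain term with a phrase containing the same term.
    Query query = new QueryParser(FIELD_NAME, analyzer).parse("y \"x y z\"");
    CachingTokenFilter tokenStream = new CachingTokenFilter(
        analyzer.tokenStream(FIELD_NAME, new StringReader(text)));
    Highlighter highlighter = new Highlighter(formatter,
        new SpanScorer(query, FIELD_NAME, tokenStream));
    highlighter.setTextFragmenter(new SimpleFragmenter(40));
    tokenStream.reset();
    // With this fix, standalone "y" hits are highlighted as well as the
    // occurrences of "y" inside the phrase "x y z".
    String result = highlighter.getBestFragments(tokenStream, text, 2, "...");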
WeightedSpanTermExtractor.java
@@ -1,433 +1,460 @@
 package org.apache.lucene.search.highlight;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.FilterIndexReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreRangeQuery;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.FilteredQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.search.spans.Spans;
 
 /**
  * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether Terms from the query are contained in a supplied TokenStream.
  */
 public class WeightedSpanTermExtractor {
 
   private String fieldName;
   private CachingTokenFilter cachedTokenFilter;
   private Map readers = new HashMap(10); // Map<String, IndexReader>
   private String defaultField;
   private boolean highlightCnstScrRngQuery;
 
   public WeightedSpanTermExtractor() {
   }
 
   public WeightedSpanTermExtractor(String defaultField) {
     if (defaultField != null) {
       this.defaultField = defaultField.intern();
     }
   }
 
   private void closeReaders() {
     Collection readerSet = readers.values();
     Iterator it = readerSet.iterator();
 
     while (it.hasNext()) {
       IndexReader reader = (IndexReader) it.next();
       try {
         reader.close();
       } catch (IOException e) {
         // alert?
       }
     }
   }
 
   /**
    * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>Query</code>.
    *
    * @param query
    *          Query to extract Terms from
    * @param terms
    *          Map to place created WeightedSpanTerms in
    * @throws IOException
    */
   private void extract(Query query, Map terms) throws IOException {
     if (query instanceof BooleanQuery) {
       BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
-      Map booleanTerms = new HashMap();
+      Map booleanTerms = new PositionCheckingMap();
       for (int i = 0; i < queryClauses.length; i++) {
         if (!queryClauses[i].isProhibited()) {
           extract(queryClauses[i].getQuery(), booleanTerms);
         }
       }
       terms.putAll(booleanTerms);
     } else if (query instanceof PhraseQuery) {
       Term[] phraseQueryTerms = ((PhraseQuery) query).getTerms();
       SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];
       for (int i = 0; i < phraseQueryTerms.length; i++) {
         clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
       }
 
       int slop = ((PhraseQuery) query).getSlop();
       boolean inorder = false;
 
       if (slop == 0) {
         inorder = true;
       }
 
       SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
       sp.setBoost(query.getBoost());
       extractWeightedSpanTerms(terms, sp);
     } else if (query instanceof TermQuery) {
       extractWeightedTerms(terms, query);
     } else if (query instanceof SpanQuery) {
       extractWeightedSpanTerms(terms, (SpanQuery) query);
     } else if (query instanceof FilteredQuery) {
       extract(((FilteredQuery) query).getQuery(), terms);
     } else if (query instanceof DisjunctionMaxQuery) {
-      Map disjunctTerms = new HashMap();
+      Map disjunctTerms = new PositionCheckingMap();
       for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) {
         extract((Query) iterator.next(), disjunctTerms);
       }
       terms.putAll(disjunctTerms);
     } else if (query instanceof MultiPhraseQuery) {
       final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
       final List termArrays = mpq.getTermArrays();
       final int[] positions = mpq.getPositions();
       if (positions.length > 0) {
 
         int maxPosition = positions[positions.length - 1];
         for (int i = 0; i < positions.length - 1; ++i) {
           if (positions[i] > maxPosition) {
             maxPosition = positions[i];
           }
         }
 
         final List[] disjunctLists = new List[maxPosition + 1];
         int distinctPositions = 0;
 
         for (int i = 0; i < termArrays.size(); ++i) {
           final Term[] termArray = (Term[]) termArrays.get(i);
           List disjuncts = disjunctLists[positions[i]];
           if (disjuncts == null) {
             disjuncts = (disjunctLists[positions[i]] = new ArrayList(termArray.length));
             ++distinctPositions;
           }
           for (int j = 0; j < termArray.length; ++j) {
             disjuncts.add(new SpanTermQuery(termArray[j]));
           }
         }
 
         int positionGaps = 0;
         int position = 0;
         final SpanQuery[] clauses = new SpanQuery[distinctPositions];
         for (int i = 0; i < disjunctLists.length; ++i) {
           List disjuncts = disjunctLists[i];
           if (disjuncts != null) {
             clauses[position++] = new SpanOrQuery((SpanQuery[]) disjuncts
                 .toArray(new SpanQuery[disjuncts.size()]));
           } else {
             ++positionGaps;
           }
         }
 
         final int slop = mpq.getSlop();
         final boolean inorder = (slop == 0);
 
         SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
         sp.setBoost(query.getBoost());
         extractWeightedSpanTerms(terms, sp);
       }
     } else if (query instanceof ConstantScoreRangeQuery) {
       ConstantScoreRangeQuery q = (ConstantScoreRangeQuery) query;
       Term lower = new Term(fieldName, q.getLowerVal());
       Term upper = new Term(fieldName, q.getUpperVal());
       FilterIndexReader fir = new FilterIndexReader(getReaderForField(fieldName));
       try {
         TermEnum te = fir.terms(lower);
         BooleanQuery bq = new BooleanQuery();
         do {
           Term term = te.term();
           if (term != null && upper.compareTo(term) >= 0) {
             bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
           } else {
             break;
           }
         } while (te.next());
         extract(bq, terms);
       } finally {
         fir.close();
       }
     }
   }
 
   /**
    * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>SpanQuery</code>.
    *
    * @param terms
    *          Map to place created WeightedSpanTerms in
    * @param spanQuery
    *          SpanQuery to extract Terms from
    * @throws IOException
    */
   private void extractWeightedSpanTerms(Map terms, SpanQuery spanQuery) throws IOException {
     Set nonWeightedTerms = new HashSet();
     spanQuery.extractTerms(nonWeightedTerms);
 
     Set fieldNames;
 
     if (fieldName == null) {
       fieldNames = new HashSet();
       for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
         Term queryTerm = (Term) iter.next();
         fieldNames.add(queryTerm.field());
       }
     } else {
       fieldNames = new HashSet(1);
       fieldNames.add(fieldName);
     }
     // To support the use of the default field name
     if (defaultField != null) {
       fieldNames.add(defaultField);
     }
 
     Iterator it = fieldNames.iterator();
     List spanPositions = new ArrayList();
 
     while (it.hasNext()) {
       String field = (String) it.next();
 
       IndexReader reader = getReaderForField(field);
       Spans spans = spanQuery.getSpans(reader);
 
       // collect span positions
       while (spans.next()) {
         spanPositions.add(new PositionSpan(spans.start(), spans.end() - 1));
       }
 
       cachedTokenFilter.reset();
     }
 
     if (spanPositions.size() == 0) {
       // no spans found
       return;
     }
 
     for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
       Term queryTerm = (Term) iter.next();
 
       if (fieldNameComparator(queryTerm.field())) {
         WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(queryTerm.text());
 
         if (weightedSpanTerm == null) {
           weightedSpanTerm = new WeightedSpanTerm(spanQuery.getBoost(), queryTerm.text());
           weightedSpanTerm.addPositionSpans(spanPositions);
           weightedSpanTerm.positionSensitive = true;
           terms.put(queryTerm.text(), weightedSpanTerm);
         } else {
           if (spanPositions.size() > 0) {
             weightedSpanTerm.addPositionSpans(spanPositions);
-            weightedSpanTerm.positionSensitive = true;
           }
         }
       }
     }
   }
 
   /**
    * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>Query</code>.
    *
    * @param terms
    *          Map to place created WeightedSpanTerms in
    * @param query
    *          Query to extract Terms from
    * @throws IOException
    */
   private void extractWeightedTerms(Map terms, Query query) throws IOException {
     Set nonWeightedTerms = new HashSet();
     query.extractTerms(nonWeightedTerms);
 
     for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
       Term queryTerm = (Term) iter.next();
 
       if (fieldNameComparator(queryTerm.field())) {
         WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), queryTerm.text());
         terms.put(queryTerm.text(), weightedSpanTerm);
       }
     }
   }
 
   /**
    * Necessary to implement matches for queries against <code>defaultField</code>
    */
   private boolean fieldNameComparator(String fieldNameToCheck) {
     boolean rv = fieldName == null || fieldNameToCheck == fieldName
         || fieldNameToCheck == defaultField;
     return rv;
   }
 
   private IndexReader getReaderForField(String field) {
     IndexReader reader = (IndexReader) readers.get(field);
     if (reader == null) {
       MemoryIndex indexer = new MemoryIndex();
       indexer.addField(field, cachedTokenFilter);
       IndexSearcher searcher = indexer.createSearcher();
       reader = searcher.getIndexReader();
       readers.put(field, reader);
     }
     return reader;
   }
 
   /**
    * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
    *
    * <p>
    *
    * @param query
    *          that caused hit
    * @param tokenStream
    *          of text to be highlighted
    * @return
    * @throws IOException
    */
   public Map getWeightedSpanTerms(Query query, CachingTokenFilter cachingTokenFilter)
       throws IOException {
     this.fieldName = null;
     this.cachedTokenFilter = cachingTokenFilter;
 
-    Map terms = new HashMap();
+    Map terms = new PositionCheckingMap();
     try {
       extract(query, terms);
     } finally {
       closeReaders();
     }
 
     return terms;
   }
 
   /**
    * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
    *
    * <p>
    *
    * @param query
    *          that caused hit
    * @param tokenStream
    *          of text to be highlighted
    * @param fieldName
    *          restricts Term's used based on field name
    * @return
    * @throws IOException
    */
   public Map getWeightedSpanTerms(Query query, CachingTokenFilter cachingTokenFilter,
       String fieldName) throws IOException {
     if (fieldName != null) {
       this.fieldName = fieldName.intern();
     }
 
-    Map terms = new HashMap();
+    Map terms = new PositionCheckingMap();
     this.cachedTokenFilter = cachingTokenFilter;
     try {
       extract(query, terms);
     } finally {
       closeReaders();
     }
 
     return terms;
   }
 
   /**
    * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
    * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
    *
    * <p>
    *
    * @param query
    *          that caused hit
    * @param tokenStream
    *          of text to be highlighted
    * @param fieldName
    *          restricts Term's used based on field name
    * @param reader
    *          to use for scoring
    * @return
    * @throws IOException
    */
   public Map getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName,
       IndexReader reader) throws IOException {
     this.fieldName = fieldName;
     this.cachedTokenFilter = new CachingTokenFilter(tokenStream);
 
-    Map terms = new HashMap();
+    Map terms = new PositionCheckingMap();
     extract(query, terms);
 
     int totalNumDocs = reader.numDocs();
     Set weightedTerms = terms.keySet();
     Iterator it = weightedTerms.iterator();
 
     try {
       while (it.hasNext()) {
         WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next());
         int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
 
         // IDF algorithm taken from DefaultSimilarity class
         float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
         weightedSpanTerm.weight *= idf;
       }
     } finally {
 
       closeReaders();
     }
 
     return terms;
   }
 
   public boolean isHighlightCnstScrRngQuery() {
     return highlightCnstScrRngQuery;
   }
 
   public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) {
     this.highlightCnstScrRngQuery = highlightCnstScrRngQuery;
   }
+
+  /**
+   * This class makes sure that if both position sensitive and insensitive
+   * versions of the same term are added, the position insensitive one wins.
+   */
+  private class PositionCheckingMap extends HashMap {
+
+    public void putAll(Map m) {
+      Iterator it = m.keySet().iterator();
+      while (it.hasNext()) {
+        Object key = it.next();
+        Object val = m.get(key);
+        this.put(key, val);
+      }
+    }
+
+    public Object put(Object key, Object value) {
+      Object prev = super.put(key, value);
+      if (prev == null) return prev;
+      WeightedSpanTerm prevTerm = (WeightedSpanTerm) prev;
+      WeightedSpanTerm newTerm = (WeightedSpanTerm) value;
+      if (!prevTerm.positionSensitive) {
+        newTerm.positionSensitive = false;
+      }
+      return prev;
+    }
+
+  }
 }
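For clarity, the merge rule that the PositionCheckingMap above implements, restated as a self-contained sketch (the Entry and PositionCheckingDemo names are hypothetical, not Lucene code): once a term has been stored position-insensitively, a later position-sensitive put must not re-restrict it.

    import java.util.HashMap;
    import java.util.Map;

    // Stand-in for WeightedSpanTerm: only the positionSensitive flag matters here.
    class Entry {
      boolean positionSensitive;

      Entry(boolean positionSensitive) {
        this.positionSensitive = positionSensitive;
      }
    }

    public class PositionCheckingDemo {

      // Same rule as PositionCheckingMap.put() above: if the previous entry for
      // the key was position insensitive, the new entry is forced insensitive too.
      static Entry put(Map<String, Entry> terms, String key, Entry value) {
        Entry prev = terms.put(key, value);
        if (prev != null && !prev.positionSensitive) {
          value.positionSensitive = false;
        }
        return prev;
      }

      public static void main(String[] args) {
        Map<String, Entry> terms = new HashMap<String, Entry>();
        put(terms, "y", new Entry(false)); // added for the plain term query "y"
        put(terms, "y", new Entry(true));  // added again for the phrase "x y z"
        // The insensitive version wins, so "y" matches anywhere:
        System.out.println(terms.get("y").positionSensitive); // prints false
      }
    }

Note that PositionCheckingMap also overrides putAll to route every entry through its own put; a plain HashMap.putAll would bypass the override, which matters because the BooleanQuery and DisjunctionMaxQuery branches of extract() merge their sub-maps back into the caller's map with putAll.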
HighlighterTest.java
@@ -235,6 +235,30 @@ public class HighlighterTest extends TestCase implements Formatter {
           numHighlights == 3);
     }
   }
 
+  // position sensitive query added after position insensitive query
+  public void testPosTermStdTerm() throws Exception {
+    doSearching("y \"x y z\"");
+
+    int maxNumFragmentsRequired = 2;
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(FIELD_NAME,
+          new StringReader(text)));
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      highlighter.setTextFragmenter(new SimpleFragmenter(40));
+      tokenStream.reset();
+
+      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+          "...");
+      System.out.println("\t" + result);
+
+      assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+          numHighlights == 4);
+    }
+  }
+
   public void testSpanMultiPhraseQueryHighlighting() throws Exception {
     MultiPhraseQuery mpq = new MultiPhraseQuery();