mirror of https://github.com/apache/lucene.git
Commit of LUCENE-794 patch - adding phrase/span query support to highlighter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@652164 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8f1feaa484
commit
35c7eb36df
|
@ -1,27 +1,29 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="highlighter" default="default">
|
||||
<project name="highlighter" default="buildHighlighter">
|
||||
|
||||
<description>
|
||||
Hits highlighter
|
||||
Hits highlighter
|
||||
</description>
|
||||
|
||||
<import file="../contrib-build.xml"/>
|
||||
|
||||
<property name="memory.jar" location="../../build/contrib/memory/lucene-memory-${version}.jar"/>
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${lucene.jar}"/>
|
||||
<pathelement path="${memory.jar}"/>
|
||||
<pathelement path="${project.classpath}"/>
|
||||
</path>
|
||||
|
||||
|
||||
<target name="buildHighlighter" depends="buildMemory,default" />
|
||||
|
||||
<target name="buildMemory" >
|
||||
<echo>Highlighter building dependency ${memory.jar}</echo>
|
||||
<ant antfile="../memory/build.xml" target="default" inheritall="false"/>
|
||||
</target>
|
||||
|
||||
|
||||
</project>
|
||||
|
||||
|
|
|
@ -0,0 +1,95 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* {@link Fragmenter} implementation which breaks text up into same-size
|
||||
* fragments but does not split up Spans. This is a simple sample class.
|
||||
*/
|
||||
public class SimpleSpanFragmenter implements Fragmenter {
|
||||
private static final int DEFAULT_FRAGMENT_SIZE = 100;
|
||||
private int fragmentSize;
|
||||
private int currentNumFrags;
|
||||
private int position = -1;
|
||||
private SpanScorer spanScorer;
|
||||
private int waitForPos = -1;
|
||||
|
||||
/**
|
||||
* @param spanscorer SpanScorer that was used to score hits
|
||||
*/
|
||||
public SimpleSpanFragmenter(SpanScorer spanscorer) {
|
||||
this(spanscorer, DEFAULT_FRAGMENT_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param spanscorer SpanScorer that was used to score hits
|
||||
* @param fragmentSize size in bytes of each fragment
|
||||
*/
|
||||
public SimpleSpanFragmenter(SpanScorer spanscorer, int fragmentSize) {
|
||||
this.fragmentSize = fragmentSize;
|
||||
this.spanScorer = spanscorer;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token)
|
||||
*/
|
||||
public boolean isNewFragment(Token token) {
|
||||
position += token.getPositionIncrement();
|
||||
|
||||
if (waitForPos == position) {
|
||||
waitForPos = -1;
|
||||
} else if (waitForPos != -1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
if (wSpanTerm != null) {
|
||||
List positionSpans = wSpanTerm.getPositionSpans();
|
||||
|
||||
for (int i = 0; i < positionSpans.size(); i++) {
|
||||
if (((PositionSpan) positionSpans.get(i)).start == position) {
|
||||
waitForPos = ((PositionSpan) positionSpans.get(i)).end + 1;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags);
|
||||
|
||||
if (isNewFrag) {
|
||||
currentNumFrags++;
|
||||
}
|
||||
|
||||
return isNewFrag;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String)
|
||||
*/
|
||||
public void start(String originalText) {
|
||||
position = 0;
|
||||
currentNumFrags = 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,218 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
||||
|
||||
/**
|
||||
* {@link Scorer} implementation which scores text fragments by the number of
|
||||
* unique query terms found. This class converts appropriate Querys to
|
||||
* SpanQuerys and attempts to score only those terms that participated in
|
||||
* generating the 'hit' on the document.
|
||||
*/
|
||||
public class SpanScorer implements Scorer {
|
||||
private float totalScore;
|
||||
private Set foundTerms;
|
||||
private Map fieldWeightedSpanTerms;
|
||||
private float maxTermWeight;
|
||||
private int position = -1;
|
||||
private String defaultField;
|
||||
private boolean highlightCnstScrRngQuery;
|
||||
|
||||
/**
|
||||
* @param query
|
||||
* Query to use for highlighting
|
||||
* @param field
|
||||
* Field to highlight - pass null to ignore fields
|
||||
* @param tokenStream
|
||||
* of source text to be highlighted
|
||||
* @throws IOException
|
||||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter) throws IOException {
|
||||
init(query, field, cachingTokenFilter, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param query
|
||||
* Query to use for highlighting
|
||||
* @param field
|
||||
* Field to highlight - pass null to ignore fields
|
||||
* @param tokenStream
|
||||
* of source text to be highlighted
|
||||
* @param reader
|
||||
* @throws IOException
|
||||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, IndexReader reader)
|
||||
throws IOException {
|
||||
init(query, field, cachingTokenFilter, reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* As above, but with ability to pass in an <tt>IndexReader</tt>
|
||||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField)
|
||||
throws IOException {
|
||||
this.defaultField = defaultField.intern();
|
||||
init(query, field, cachingTokenFilter, reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param defaultField - The default field for queries with the field name unspecified
|
||||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, String defaultField) throws IOException {
|
||||
this.defaultField = defaultField.intern();
|
||||
init(query, field, cachingTokenFilter, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param weightedTerms
|
||||
*/
|
||||
public SpanScorer(WeightedSpanTerm[] weightedTerms) {
|
||||
this.fieldWeightedSpanTerms = new HashMap(weightedTerms.length);
|
||||
|
||||
for (int i = 0; i < weightedTerms.length; i++) {
|
||||
WeightedSpanTerm existingTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(weightedTerms[i].term);
|
||||
|
||||
if ((existingTerm == null) ||
|
||||
(existingTerm.weight < weightedTerms[i].weight)) {
|
||||
// if a term is defined more than once, always use the highest
|
||||
// scoring weight
|
||||
fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]);
|
||||
maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
|
||||
*/
|
||||
public float getFragmentScore() {
|
||||
return totalScore;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return The highest weighted term (useful for passing to
|
||||
* GradientFormatter to set top end of coloring scale.
|
||||
*/
|
||||
public float getMaxTermWeight() {
|
||||
return maxTermWeight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
|
||||
* int)
|
||||
*/
|
||||
public float getTokenScore(Token token) {
|
||||
position += token.getPositionIncrement();
|
||||
String termText = new String(token.termBuffer(), 0, token.termLength());
|
||||
|
||||
WeightedSpanTerm weightedSpanTerm;
|
||||
|
||||
if ((weightedSpanTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(
|
||||
termText)) == null) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (weightedSpanTerm.positionSensitive &&
|
||||
!weightedSpanTerm.checkPosition(position)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
float score = weightedSpanTerm.getWeight();
|
||||
|
||||
// found a query term - is it unique in this doc?
|
||||
if (!foundTerms.contains(termText)) {
|
||||
totalScore += score;
|
||||
foundTerms.add(termText);
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the WeightedSpanTerm for the specified token. Useful for passing
|
||||
* Span information to a Fragmenter.
|
||||
*
|
||||
* @param token
|
||||
* @return WeightedSpanTerm for token
|
||||
*/
|
||||
public WeightedSpanTerm getWeightedSpanTerm(String token) {
|
||||
return (WeightedSpanTerm) fieldWeightedSpanTerms.get(token);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param query
|
||||
* @param field
|
||||
* @param tokenStream
|
||||
* @param reader
|
||||
* @throws IOException
|
||||
*/
|
||||
private void init(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, IndexReader reader)
|
||||
throws IOException {
|
||||
WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
|
||||
: new WeightedSpanTermExtractor(defaultField);
|
||||
|
||||
qse.setHighlightCnstScrRngQuery(highlightCnstScrRngQuery);
|
||||
|
||||
if (reader == null) {
|
||||
this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
|
||||
cachingTokenFilter, field);
|
||||
} else {
|
||||
this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query,
|
||||
cachingTokenFilter, field, reader);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return whether ConstantScoreRangeQuerys are set to be highlighted
|
||||
*/
|
||||
public boolean isHighlightCnstScrRngQuery() {
|
||||
return highlightCnstScrRngQuery;
|
||||
}
|
||||
|
||||
/**
|
||||
* If you call Highlighter#getBestFragment() more than once you must reset
|
||||
* the SpanScorer between each call.
|
||||
*/
|
||||
public void reset() {
|
||||
position = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Turns highlighting of ConstantScoreRangeQuery on/off. ConstantScoreRangeQuerys cannot be
|
||||
* highlighted if you rewrite the query first.
|
||||
*
|
||||
* @param highlightCnstScrRngQuery
|
||||
*/
|
||||
public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) {
|
||||
this.highlightCnstScrRngQuery = highlightCnstScrRngQuery;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
|
||||
*/
|
||||
public void startFragment(TextFragment newFragment) {
|
||||
foundTerms = new HashSet();
|
||||
totalScore = 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,104 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Lightweight class to hold term, weight, and positions used for scoring this
|
||||
* term.
|
||||
*/
|
||||
public class WeightedSpanTerm extends WeightedTerm{
|
||||
boolean positionSensitive;
|
||||
private List positionSpans = new ArrayList();
|
||||
|
||||
/**
|
||||
* @param weight
|
||||
* @param term
|
||||
*/
|
||||
public WeightedSpanTerm(float weight, String term) {
|
||||
super(weight, term);
|
||||
this.positionSpans = new ArrayList();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param weight
|
||||
* @param term
|
||||
* @param positionSensitive
|
||||
*/
|
||||
public WeightedSpanTerm(float weight, String term, boolean positionSensitive) {
|
||||
super(weight, term);
|
||||
this.positionSensitive = positionSensitive;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks to see if this term is valid at <code>position</code>.
|
||||
*
|
||||
* @param position
|
||||
* to check against valid term postions
|
||||
* @return true iff this term is a hit at this position
|
||||
*/
|
||||
public boolean checkPosition(int position) {
|
||||
// There would probably be a slight speed improvement if PositionSpans
|
||||
// where kept in some sort of priority queue - that way this method
|
||||
// could
|
||||
// bail early without checking each PositionSpan.
|
||||
Iterator positionSpanIt = positionSpans.iterator();
|
||||
|
||||
while (positionSpanIt.hasNext()) {
|
||||
PositionSpan posSpan = (PositionSpan) positionSpanIt.next();
|
||||
|
||||
if (((position >= posSpan.start) && (position <= posSpan.end))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public void addPositionSpans(List positionSpans) {
|
||||
this.positionSpans.addAll(positionSpans);
|
||||
}
|
||||
|
||||
public boolean isPositionSensitive() {
|
||||
return positionSensitive;
|
||||
}
|
||||
|
||||
public void setPositionSensitive(boolean positionSensitive) {
|
||||
this.positionSensitive = positionSensitive;
|
||||
}
|
||||
|
||||
public List getPositionSpans() {
|
||||
return positionSpans;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Utility class to store a Span
|
||||
class PositionSpan {
|
||||
int start;
|
||||
int end;
|
||||
|
||||
public PositionSpan(int start, int end) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,436 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.FilterIndexReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.memory.MemoryIndex;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.ConstantScoreRangeQuery;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.FilteredQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MultiPhraseQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
|
||||
/**
|
||||
* Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether Terms from the query are contained in a supplied TokenStream.
|
||||
*/
|
||||
public class WeightedSpanTermExtractor {
|
||||
|
||||
private String fieldName;
|
||||
private CachingTokenFilter cachedTokenFilter;
|
||||
private Map readers = new HashMap(10); // Map<String, IndexReader>
|
||||
private String defaultField;
|
||||
private boolean highlightCnstScrRngQuery;
|
||||
|
||||
public WeightedSpanTermExtractor() {
|
||||
}
|
||||
|
||||
public WeightedSpanTermExtractor(String defaultField) {
|
||||
if (defaultField != null) {
|
||||
this.defaultField = defaultField.intern();
|
||||
}
|
||||
}
|
||||
|
||||
private void closeReaders() {
|
||||
Collection readerSet = readers.values();
|
||||
Iterator it = readerSet.iterator();
|
||||
|
||||
while (it.hasNext()) {
|
||||
IndexReader reader = (IndexReader) it.next();
|
||||
try {
|
||||
reader.close();
|
||||
} catch (IOException e) {
|
||||
// alert?
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>Query</code>.
|
||||
*
|
||||
* @param query
|
||||
* Query to extract Terms from
|
||||
* @param terms
|
||||
* Map to place created WeightedSpanTerms in
|
||||
* @throws IOException
|
||||
*/
|
||||
private void extract(Query query, Map terms) throws IOException {
|
||||
if (query instanceof BooleanQuery) {
|
||||
BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
|
||||
Map booleanTerms = new HashMap();
|
||||
for (int i = 0; i < queryClauses.length; i++) {
|
||||
if (!queryClauses[i].isProhibited()) {
|
||||
extract(queryClauses[i].getQuery(), booleanTerms);
|
||||
}
|
||||
}
|
||||
terms.putAll(booleanTerms);
|
||||
} else if (query instanceof PhraseQuery) {
|
||||
Term[] phraseQueryTerms = ((PhraseQuery) query).getTerms();
|
||||
SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];
|
||||
for (int i = 0; i < phraseQueryTerms.length; i++) {
|
||||
clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
|
||||
}
|
||||
|
||||
int slop = ((PhraseQuery) query).getSlop();
|
||||
boolean inorder = false;
|
||||
|
||||
if (slop == 0) {
|
||||
inorder = true;
|
||||
}
|
||||
|
||||
SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
|
||||
sp.setBoost(query.getBoost());
|
||||
extractWeightedSpanTerms(terms, sp);
|
||||
} else if (query instanceof TermQuery) {
|
||||
extractWeightedTerms(terms, query);
|
||||
} else if (query instanceof SpanQuery) {
|
||||
extractWeightedSpanTerms(terms, (SpanQuery) query);
|
||||
} else if (query instanceof FilteredQuery) {
|
||||
extract(((FilteredQuery) query).getQuery(), terms);
|
||||
} else if (query instanceof DisjunctionMaxQuery) {
|
||||
Map disjunctTerms = new HashMap();
|
||||
for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) {
|
||||
extract((Query) iterator.next(), disjunctTerms);
|
||||
}
|
||||
terms.putAll(disjunctTerms);
|
||||
} else if (query instanceof MultiPhraseQuery) {
|
||||
final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
|
||||
final List termArrays = mpq.getTermArrays();
|
||||
final int[] positions = mpq.getPositions();
|
||||
if (positions.length > 0) {
|
||||
|
||||
int maxPosition = positions[positions.length - 1];
|
||||
for (int i = 0; i < positions.length - 1; ++i) {
|
||||
if (positions[i] > maxPosition) {
|
||||
maxPosition = positions[i];
|
||||
}
|
||||
}
|
||||
|
||||
final List[] disjunctLists = new List[maxPosition + 1];
|
||||
int distinctPositions = 0;
|
||||
|
||||
for (int i = 0; i < termArrays.size(); ++i) {
|
||||
final Term[] termArray = (Term[]) termArrays.get(i);
|
||||
List disjuncts = disjunctLists[positions[i]];
|
||||
if (disjuncts == null) {
|
||||
disjuncts = (disjunctLists[positions[i]] = new ArrayList(termArray.length));
|
||||
++distinctPositions;
|
||||
}
|
||||
for (int j = 0; j < termArray.length; ++j) {
|
||||
disjuncts.add(new SpanTermQuery(termArray[j]));
|
||||
}
|
||||
}
|
||||
|
||||
int positionGaps = 0;
|
||||
int position = 0;
|
||||
final SpanQuery[] clauses = new SpanQuery[distinctPositions];
|
||||
for (int i = 0; i < disjunctLists.length; ++i) {
|
||||
List disjuncts = disjunctLists[i];
|
||||
if (disjuncts != null) {
|
||||
clauses[position++] = new SpanOrQuery((SpanQuery[]) disjuncts
|
||||
.toArray(new SpanQuery[disjuncts.size()]));
|
||||
} else {
|
||||
++positionGaps;
|
||||
}
|
||||
}
|
||||
|
||||
final int slop = mpq.getSlop();
|
||||
final boolean inorder = (slop == 0);
|
||||
|
||||
SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
|
||||
sp.setBoost(query.getBoost());
|
||||
extractWeightedSpanTerms(terms, sp);
|
||||
}
|
||||
} else if (query instanceof ConstantScoreRangeQuery) {
|
||||
ConstantScoreRangeQuery q = (ConstantScoreRangeQuery) query;
|
||||
Term lower = new Term(fieldName, q.getLowerVal());
|
||||
Term upper = new Term(fieldName, q.getUpperVal());
|
||||
FilterIndexReader fir = new FilterIndexReader(getReaderForField(fieldName));
|
||||
try {
|
||||
TermEnum te = fir.terms(lower);
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
do {
|
||||
Term term = te.term();
|
||||
if (term != null && upper.compareTo(term) >= 0) {
|
||||
bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (te.next());
|
||||
extract(bq, terms);
|
||||
} finally {
|
||||
fir.close();
|
||||
}
|
||||
} else {
|
||||
// NO-OP
|
||||
System.out.println("found none");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>SpanQuery</code>.
|
||||
*
|
||||
* @param terms
|
||||
* Map to place created WeightedSpanTerms in
|
||||
* @param spanQuery
|
||||
* SpanQuery to extract Terms from
|
||||
* @throws IOException
|
||||
*/
|
||||
private void extractWeightedSpanTerms(Map terms, SpanQuery spanQuery) throws IOException {
|
||||
Set nonWeightedTerms = new HashSet();
|
||||
spanQuery.extractTerms(nonWeightedTerms);
|
||||
|
||||
Set fieldNames;
|
||||
|
||||
if (fieldName == null) {
|
||||
fieldNames = new HashSet();
|
||||
for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
|
||||
Term queryTerm = (Term) iter.next();
|
||||
fieldNames.add(queryTerm.field());
|
||||
}
|
||||
} else {
|
||||
fieldNames = new HashSet(1);
|
||||
fieldNames.add(fieldName);
|
||||
}
|
||||
// To support the use of the default field name
|
||||
if (defaultField != null) {
|
||||
fieldNames.add(defaultField);
|
||||
}
|
||||
|
||||
Iterator it = fieldNames.iterator();
|
||||
List spanPositions = new ArrayList();
|
||||
|
||||
while (it.hasNext()) {
|
||||
String field = (String) it.next();
|
||||
|
||||
IndexReader reader = getReaderForField(field);
|
||||
Spans spans = spanQuery.getSpans(reader);
|
||||
|
||||
// collect span positions
|
||||
while (spans.next()) {
|
||||
spanPositions.add(new PositionSpan(spans.start(), spans.end() - 1));
|
||||
}
|
||||
|
||||
cachedTokenFilter.reset();
|
||||
}
|
||||
|
||||
if (spanPositions.size() == 0) {
|
||||
// no spans found
|
||||
return;
|
||||
}
|
||||
|
||||
for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
|
||||
Term queryTerm = (Term) iter.next();
|
||||
|
||||
if (fieldNameComparator(queryTerm.field())) {
|
||||
WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(queryTerm.text());
|
||||
|
||||
if (weightedSpanTerm == null) {
|
||||
weightedSpanTerm = new WeightedSpanTerm(spanQuery.getBoost(), queryTerm.text());
|
||||
weightedSpanTerm.addPositionSpans(spanPositions);
|
||||
weightedSpanTerm.positionSensitive = true;
|
||||
terms.put(queryTerm.text(), weightedSpanTerm);
|
||||
} else {
|
||||
if (spanPositions.size() > 0) {
|
||||
weightedSpanTerm.addPositionSpans(spanPositions);
|
||||
weightedSpanTerm.positionSensitive = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>Query</code>.
|
||||
*
|
||||
* @param terms
|
||||
* Map to place created WeightedSpanTerms in
|
||||
* @param query
|
||||
* Query to extract Terms from
|
||||
* @throws IOException
|
||||
*/
|
||||
private void extractWeightedTerms(Map terms, Query query) throws IOException {
|
||||
Set nonWeightedTerms = new HashSet();
|
||||
query.extractTerms(nonWeightedTerms);
|
||||
|
||||
for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
|
||||
Term queryTerm = (Term) iter.next();
|
||||
|
||||
if (fieldNameComparator(queryTerm.field())) {
|
||||
WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), queryTerm.text());
|
||||
terms.put(queryTerm.text(), weightedSpanTerm);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Necessary to implement matches for queries against <code>defaultField</code>
|
||||
*/
|
||||
private boolean fieldNameComparator(String fieldNameToCheck) {
|
||||
boolean rv = fieldName == null || fieldNameToCheck == fieldName
|
||||
|| fieldNameToCheck == defaultField;
|
||||
return rv;
|
||||
}
|
||||
|
||||
private IndexReader getReaderForField(String field) {
|
||||
IndexReader reader = (IndexReader) readers.get(field);
|
||||
if (reader == null) {
|
||||
MemoryIndex indexer = new MemoryIndex();
|
||||
indexer.addField(field, cachedTokenFilter);
|
||||
IndexSearcher searcher = indexer.createSearcher();
|
||||
reader = searcher.getIndexReader();
|
||||
readers.put(field, reader);
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
|
||||
*
|
||||
* <p>
|
||||
*
|
||||
* @param query
|
||||
* that caused hit
|
||||
* @param tokenStream
|
||||
* of text to be highlighted
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public Map getWeightedSpanTerms(Query query, CachingTokenFilter cachingTokenFilter)
|
||||
throws IOException {
|
||||
this.fieldName = null;
|
||||
this.cachedTokenFilter = cachingTokenFilter;
|
||||
|
||||
Map terms = new HashMap();
|
||||
try {
|
||||
extract(query, terms);
|
||||
} finally {
|
||||
closeReaders();
|
||||
}
|
||||
|
||||
return terms;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
|
||||
*
|
||||
* <p>
|
||||
*
|
||||
* @param query
|
||||
* that caused hit
|
||||
* @param tokenStream
|
||||
* of text to be highlighted
|
||||
* @param fieldName
|
||||
* restricts Term's used based on field name
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public Map getWeightedSpanTerms(Query query, CachingTokenFilter cachingTokenFilter,
|
||||
String fieldName) throws IOException {
|
||||
if (fieldName != null) {
|
||||
this.fieldName = fieldName.intern();
|
||||
}
|
||||
|
||||
Map terms = new HashMap();
|
||||
this.cachedTokenFilter = cachingTokenFilter;
|
||||
try {
|
||||
extract(query, terms);
|
||||
} finally {
|
||||
closeReaders();
|
||||
}
|
||||
|
||||
return terms;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
|
||||
* <code>IndexReader</code> to properly weight terms (for gradient highlighting).
|
||||
*
|
||||
* <p>
|
||||
*
|
||||
* @param query
|
||||
* that caused hit
|
||||
* @param tokenStream
|
||||
* of text to be highlighted
|
||||
* @param fieldName
|
||||
* restricts Term's used based on field name
|
||||
* @param reader
|
||||
* to use for scoring
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public Map getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName,
|
||||
IndexReader reader) throws IOException {
|
||||
this.fieldName = fieldName;
|
||||
this.cachedTokenFilter = new CachingTokenFilter(tokenStream);
|
||||
|
||||
Map terms = new HashMap();
|
||||
extract(query, terms);
|
||||
|
||||
int totalNumDocs = reader.numDocs();
|
||||
Set weightedTerms = terms.keySet();
|
||||
Iterator it = weightedTerms.iterator();
|
||||
|
||||
try {
|
||||
while (it.hasNext()) {
|
||||
WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next());
|
||||
int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
|
||||
|
||||
// IDF algorithm taken from DefaultSimilarity class
|
||||
float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
|
||||
weightedSpanTerm.weight *= idf;
|
||||
}
|
||||
} finally {
|
||||
|
||||
closeReaders();
|
||||
}
|
||||
|
||||
return terms;
|
||||
}
|
||||
|
||||
public boolean isHighlightCnstScrRngQuery() {
|
||||
return highlightCnstScrRngQuery;
|
||||
}
|
||||
|
||||
public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) {
|
||||
this.highlightCnstScrRngQuery = highlightCnstScrRngQuery;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
<html>
|
||||
<body>
|
||||
<p>
|
||||
The spanscorer classes provide the Highlighter with the ability
|
||||
to only highlight the Tokens that contributed to a query match.
|
||||
The SpanScorer class is the central component and it will attempt to score Terms
|
||||
based on whether they actually participated in scoring the Query.
|
||||
</p>
|
||||
<p>
|
||||
The implementation is very similar to QueryScorer in that WeightedSpanTerms are extracted
|
||||
from the given Query and then placed in a Map. During Token scoring, Terms found in
|
||||
the Map return a score equal to their weight. The added wrinkle is that when terms are
|
||||
extracted, the sub-queries that make up the Query are converted to SpanQuery's and
|
||||
SpanQuery.getSpans() is applied to a MemoryIndex containing the TokenStream of the text to
|
||||
be highlighted if the sub-query is position sensitive. The start and end positions of the
|
||||
matching Spans are recorded with the respective WeightedSpanTerms and these positions are
|
||||
then used to filter possible Token matches during scoring.
|
||||
</p>
|
||||
<h2>Example Usage</h2>
|
||||
|
||||
<pre>
|
||||
IndexSearcher searcher = new IndexSearcher(ramDir);
|
||||
Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
|
||||
query = query.rewrite(reader); //required to expand search terms
|
||||
Hits hits = searcher.search(query);
|
||||
|
||||
for (int i = 0; i < hits.length(); i++)
|
||||
{
|
||||
String text = hits.doc(i).get(FIELD_NAME);
|
||||
CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
|
||||
FIELD_NAME, new StringReader(text)));
|
||||
Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream));
|
||||
tokenStream.reset();
|
||||
|
||||
// Get 3 best fragments and separate with a "..."
|
||||
String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
|
||||
System.out.println(result);
|
||||
}
|
||||
</pre>
|
||||
|
||||
<p>
|
||||
If you make a call to any of the getBestFragments() methods more than once, you must call reset() on the SpanScorer
|
||||
between each call.
|
||||
</p>
|
||||
|
||||
<p>The SpanScorer class has a constructor which can use an IndexReader to derive the IDF (inverse document frequency)
|
||||
for each term in order to influence the score. This is useful for helping to extract the most significant sections
|
||||
of a document and in supplying scores used by the GradientFormatter to color significant words more strongly.
|
||||
The SpanScorer.getMaxTermWeight method is useful when passed to the GradientFormatter constructor to define the top score
|
||||
which is associated with the top color.</p>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue