Commit of LUCENE-794 patch - adding phrase/span query support to highlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@652164 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2008-04-29 22:07:18 +00:00
parent 8f1feaa484
commit 35c7eb36df
7 changed files with 2219 additions and 771 deletions

View File

@ -1,27 +1,29 @@
<?xml version="1.0"?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<project name="highlighter" default="buildHighlighter">

  <description>
    Hits highlighter
  </description>

  <import file="../contrib-build.xml"/>

  <!-- The highlighter's SpanScorer depends on the contrib/memory MemoryIndex. -->
  <property name="memory.jar" location="../../build/contrib/memory/lucene-memory-${version}.jar"/>

  <path id="classpath">
    <pathelement path="${lucene.jar}"/>
    <pathelement path="${memory.jar}"/>
    <pathelement path="${project.classpath}"/>
  </path>

  <!-- Build the memory contrib first, then run the standard contrib build. -->
  <target name="buildHighlighter" depends="buildMemory,default"/>

  <target name="buildMemory">
    <echo>Highlighter building dependency ${memory.jar}</echo>
    <ant antfile="../memory/build.xml" target="default" inheritall="false"/>
  </target>
</project>

View File

@ -0,0 +1,95 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import java.util.List;
/**
* {@link Fragmenter} implementation which breaks text up into same-size
* fragments but does not split up Spans. This is a simple sample class.
*/
public class SimpleSpanFragmenter implements Fragmenter {
  private static final int DEFAULT_FRAGMENT_SIZE = 100;

  // Target size (in characters of the original text) of each fragment.
  private int fragmentSize;
  // Count of fragments started so far; used to compute the next size boundary.
  private int currentNumFrags;
  // Current token position; advanced by each token's position increment.
  // Starts at -1 so the first token (increment 1) lands at position 0,
  // matching the 0-based positions recorded by SpanScorer/PositionSpan.
  private int position = -1;
  // Scorer holding the WeightedSpanTerms (with position spans) for the query.
  private SpanScorer spanScorer;
  // While != -1, we are inside a matched span that ends just before this
  // position; no new fragment may start until we reach it.
  private int waitForPos = -1;

  /**
   * @param spanscorer SpanScorer that was used to score hits
   */
  public SimpleSpanFragmenter(SpanScorer spanscorer) {
    this(spanscorer, DEFAULT_FRAGMENT_SIZE);
  }

  /**
   * @param spanscorer SpanScorer that was used to score hits
   * @param fragmentSize size in bytes of each fragment
   */
  public SimpleSpanFragmenter(SpanScorer spanscorer, int fragmentSize) {
    this.fragmentSize = fragmentSize;
    this.spanScorer = spanscorer;
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token)
   */
  public boolean isNewFragment(Token token) {
    position += token.getPositionIncrement();

    // If we were waiting for the end of a span, check whether we reached it;
    // until then, never break a fragment inside the span.
    if (waitForPos == position) {
      waitForPos = -1;
    } else if (waitForPos != -1) {
      return false;
    }

    WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(new String(token.termBuffer(), 0, token.termLength()));

    if (wSpanTerm != null) {
      List positionSpans = wSpanTerm.getPositionSpans();

      // If this token starts a recorded span, begin a new fragment here and
      // defer further breaks until the span has been fully consumed.
      for (int i = 0; i < positionSpans.size(); i++) {
        if (((PositionSpan) positionSpans.get(i)).start == position) {
          waitForPos = ((PositionSpan) positionSpans.get(i)).end + 1;

          return true;
        }
      }
    }

    // Default: break whenever the text offset passes the next size boundary.
    boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags);

    if (isNewFrag) {
      currentNumFrags++;
    }

    return isNewFrag;
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String)
   */
  public void start(String originalText) {
    // Reset to -1 (not 0): isNewFragment() pre-increments, so starting at 0
    // would put every token one position past the 0-based span positions
    // used by SpanScorer, causing spans starting at position 0 to be missed.
    position = -1;
    currentNumFrags = 1;
  }
}

View File

@ -0,0 +1,218 @@
package org.apache.lucene.search.highlight;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
/**
* {@link Scorer} implementation which scores text fragments by the number of
* unique query terms found. This class converts appropriate Querys to
* SpanQuerys and attempts to score only those terms that participated in
* generating the 'hit' on the document.
*/
public class SpanScorer implements Scorer {
  // Sum of the weights of the unique query terms found in the current fragment.
  private float totalScore;
  // Query terms already counted towards totalScore for the current fragment.
  private Set foundTerms;
  // Map<String, WeightedSpanTerm> of terms eligible for highlighting.
  private Map fieldWeightedSpanTerms;
  // Highest single-term weight seen; top end of a coloring scale.
  private float maxTermWeight;
  // Current token position; advanced by each token's position increment.
  private int position = -1;
  // Default field for queries with the field name unspecified (interned, may be null).
  private String defaultField;
  // Whether ConstantScoreRangeQuerys should be highlighted.
  private boolean highlightCnstScrRngQuery;

  /**
   * @param query
   *          Query to use for highlighting
   * @param field
   *          Field to highlight - pass null to ignore fields
   * @param cachingTokenFilter
   *          of source text to be highlighted
   * @throws IOException
   */
  public SpanScorer(Query query, String field,
    CachingTokenFilter cachingTokenFilter) throws IOException {
    init(query, field, cachingTokenFilter, null);
  }

  /**
   * @param query
   *          Query to use for highlighting
   * @param field
   *          Field to highlight - pass null to ignore fields
   * @param cachingTokenFilter
   *          of source text to be highlighted
   * @param reader
   *          used to score each term (for gradient highlighting)
   * @throws IOException
   */
  public SpanScorer(Query query, String field,
    CachingTokenFilter cachingTokenFilter, IndexReader reader)
    throws IOException {
    init(query, field, cachingTokenFilter, reader);
  }

  /**
   * As above, but with ability to pass in an <tt>IndexReader</tt>
   *
   * @param defaultField
   *          The default field for queries with the field name unspecified
   */
  public SpanScorer(Query query, String field,
    CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField)
    throws IOException {
    // Guard against null before intern() (would throw NullPointerException);
    // consistent with WeightedSpanTermExtractor(String).
    if (defaultField != null) {
      this.defaultField = defaultField.intern();
    }
    init(query, field, cachingTokenFilter, reader);
  }

  /**
   * @param defaultField - The default field for queries with the field name unspecified
   */
  public SpanScorer(Query query, String field,
    CachingTokenFilter cachingTokenFilter, String defaultField) throws IOException {
    // Guard against null before intern() (would throw NullPointerException);
    // consistent with WeightedSpanTermExtractor(String).
    if (defaultField != null) {
      this.defaultField = defaultField.intern();
    }
    init(query, field, cachingTokenFilter, null);
  }

  /**
   * @param weightedTerms
   *          pre-computed terms to highlight
   */
  public SpanScorer(WeightedSpanTerm[] weightedTerms) {
    this.fieldWeightedSpanTerms = new HashMap(weightedTerms.length);

    for (int i = 0; i < weightedTerms.length; i++) {
      WeightedSpanTerm existingTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(weightedTerms[i].term);

      if ((existingTerm == null) ||
            (existingTerm.weight < weightedTerms[i].weight)) {
        // if a term is defined more than once, always use the highest
        // scoring weight
        fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]);
        maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
      }
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
   */
  public float getFragmentScore() {
    return totalScore;
  }

  /**
   *
   * @return The highest weighted term (useful for passing to
   *         GradientFormatter to set top end of coloring scale).
   */
  public float getMaxTermWeight() {
    return maxTermWeight;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
   *      int)
   */
  public float getTokenScore(Token token) {
    position += token.getPositionIncrement();

    String termText = new String(token.termBuffer(), 0, token.termLength());

    // Not a query term at all?
    WeightedSpanTerm weightedSpanTerm;

    if ((weightedSpanTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(
          termText)) == null) {
      return 0;
    }

    // Position-sensitive terms only score when inside a recorded span.
    if (weightedSpanTerm.positionSensitive &&
          !weightedSpanTerm.checkPosition(position)) {
      return 0;
    }

    float score = weightedSpanTerm.getWeight();

    // found a query term - is it unique in this doc?
    if (!foundTerms.contains(termText)) {
      totalScore += score;
      foundTerms.add(termText);
    }

    return score;
  }

  /**
   * Retrieve the WeightedSpanTerm for the specified token. Useful for passing
   * Span information to a Fragmenter.
   *
   * @param token
   * @return WeightedSpanTerm for token
   */
  public WeightedSpanTerm getWeightedSpanTerm(String token) {
    return (WeightedSpanTerm) fieldWeightedSpanTerms.get(token);
  }

  /**
   * Extracts the WeightedSpanTerms for the query, optionally using a reader
   * to fold in IDF-based weights.
   *
   * @param query
   * @param field
   * @param cachingTokenFilter
   * @param reader
   * @throws IOException
   */
  private void init(Query query, String field,
    CachingTokenFilter cachingTokenFilter, IndexReader reader)
    throws IOException {
    WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
        : new WeightedSpanTermExtractor(defaultField);

    qse.setHighlightCnstScrRngQuery(highlightCnstScrRngQuery);

    if (reader == null) {
      this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
          cachingTokenFilter, field);
    } else {
      this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query,
          cachingTokenFilter, field, reader);
    }
  }

  /**
   * @return whether ConstantScoreRangeQuerys are set to be highlighted
   */
  public boolean isHighlightCnstScrRngQuery() {
    return highlightCnstScrRngQuery;
  }

  /**
   * If you call Highlighter#getBestFragment() more than once you must reset
   * the SpanScorer between each call.
   */
  public void reset() {
    position = -1;
  }

  /**
   * Turns highlighting of ConstantScoreRangeQuery on/off. ConstantScoreRangeQuerys cannot be
   * highlighted if you rewrite the query first.
   *
   * @param highlightCnstScrRngQuery
   */
  public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) {
    this.highlightCnstScrRngQuery = highlightCnstScrRngQuery;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
   */
  public void startFragment(TextFragment newFragment) {
    foundTerms = new HashSet();
    totalScore = 0;
  }
}

View File

@ -0,0 +1,104 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* Lightweight class to hold term, weight, and positions used for scoring this
* term.
*/
public class WeightedSpanTerm extends WeightedTerm{
  // True when this term only counts as a hit at one of the recorded positions.
  boolean positionSensitive;

  // Position ranges (inclusive) at which this term participated in a span match.
  private List positionSpans = new ArrayList();

  /**
   * @param weight
   *          degree of importance of this term
   * @param term
   *          the term text
   */
  public WeightedSpanTerm(float weight, String term) {
    super(weight, term);
    // positionSpans is already initialized by the field initializer;
    // the previous redundant reallocation here has been removed.
  }

  /**
   * @param weight
   *          degree of importance of this term
   * @param term
   *          the term text
   * @param positionSensitive
   *          whether hits must fall inside a recorded position span
   */
  public WeightedSpanTerm(float weight, String term, boolean positionSensitive) {
    super(weight, term);
    this.positionSensitive = positionSensitive;
  }

  /**
   * Checks to see if this term is valid at <code>position</code>.
   *
   * @param position
   *          to check against valid term positions
   * @return true iff this term is a hit at this position
   */
  public boolean checkPosition(int position) {
    // There would probably be a slight speed improvement if PositionSpans
    // were kept in some sort of priority queue - that way this method could
    // bail early without checking each PositionSpan.
    Iterator positionSpanIt = positionSpans.iterator();

    while (positionSpanIt.hasNext()) {
      PositionSpan posSpan = (PositionSpan) positionSpanIt.next();

      if (((position >= posSpan.start) && (position <= posSpan.end))) {
        return true;
      }
    }

    return false;
  }

  public void addPositionSpans(List positionSpans) {
    this.positionSpans.addAll(positionSpans);
  }

  public boolean isPositionSensitive() {
    return positionSensitive;
  }

  public void setPositionSensitive(boolean positionSensitive) {
    this.positionSensitive = positionSensitive;
  }

  public List getPositionSpans() {
    return positionSpans;
  }
}
// Lightweight holder for the inclusive start/end positions of one span match.
class PositionSpan {
  int start;
  int end;

  public PositionSpan(int startPosition, int endPosition) {
    start = startPosition;
    end = endPosition;
  }
}

View File

@ -0,0 +1,436 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
/**
* Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether Terms from the query are contained in a supplied TokenStream.
*/
public class WeightedSpanTermExtractor {

  // Field being highlighted; null means accept terms from any field.
  private String fieldName;
  // Token stream of the text to highlight; replayed into per-field MemoryIndexes.
  private CachingTokenFilter cachedTokenFilter;
  // Map<String, IndexReader> - one MemoryIndex-backed reader per field encountered.
  private Map readers = new HashMap(10);
  // Default field for queries with the field name unspecified (interned, may be null).
  private String defaultField;
  // NOTE(review): this flag is exposed via the getter/setter below but is never
  // consulted by extract(); ConstantScoreRangeQuerys are currently always
  // expanded and highlighted regardless of its value - confirm intended behavior.
  private boolean highlightCnstScrRngQuery;

  public WeightedSpanTermExtractor() {
  }

  public WeightedSpanTermExtractor(String defaultField) {
    if (defaultField != null) {
      this.defaultField = defaultField.intern();
    }
  }

  /** Closes all MemoryIndex readers opened during extraction (best effort). */
  private void closeReaders() {
    Collection readerSet = readers.values();
    Iterator it = readerSet.iterator();

    while (it.hasNext()) {
      IndexReader reader = (IndexReader) it.next();

      try {
        reader.close();
      } catch (IOException e) {
        // best-effort close of an in-memory reader; nothing useful to do here
      }
    }
  }

  /**
   * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied <code>Query</code>.
   * Position-sensitive query types (phrase, multi-phrase) are converted to
   * equivalent SpanQuerys so that only terms participating in a match score.
   *
   * @param query
   *          Query to extract Terms from
   * @param terms
   *          Map to place created WeightedSpanTerms in
   * @throws IOException
   */
  private void extract(Query query, Map terms) throws IOException {
    if (query instanceof BooleanQuery) {
      BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
      Map booleanTerms = new HashMap();

      for (int i = 0; i < queryClauses.length; i++) {
        if (!queryClauses[i].isProhibited()) {
          extract(queryClauses[i].getQuery(), booleanTerms);
        }
      }

      terms.putAll(booleanTerms);
    } else if (query instanceof PhraseQuery) {
      // Convert the phrase to a SpanNearQuery so positions can be checked.
      Term[] phraseQueryTerms = ((PhraseQuery) query).getTerms();
      SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];

      for (int i = 0; i < phraseQueryTerms.length; i++) {
        clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
      }

      int slop = ((PhraseQuery) query).getSlop();
      // With zero slop the terms must appear in order, mirroring PhraseQuery.
      boolean inorder = false;

      if (slop == 0) {
        inorder = true;
      }

      SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
      sp.setBoost(query.getBoost());
      extractWeightedSpanTerms(terms, sp);
    } else if (query instanceof TermQuery) {
      extractWeightedTerms(terms, query);
    } else if (query instanceof SpanQuery) {
      extractWeightedSpanTerms(terms, (SpanQuery) query);
    } else if (query instanceof FilteredQuery) {
      extract(((FilteredQuery) query).getQuery(), terms);
    } else if (query instanceof DisjunctionMaxQuery) {
      Map disjunctTerms = new HashMap();

      for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) {
        extract((Query) iterator.next(), disjunctTerms);
      }

      terms.putAll(disjunctTerms);
    } else if (query instanceof MultiPhraseQuery) {
      // Convert the multi-phrase to a SpanNearQuery of SpanOrQuerys, one
      // per distinct position, accounting for gaps with extra slop.
      final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
      final List termArrays = mpq.getTermArrays();
      final int[] positions = mpq.getPositions();

      if (positions.length > 0) {
        int maxPosition = positions[positions.length - 1];

        for (int i = 0; i < positions.length - 1; ++i) {
          if (positions[i] > maxPosition) {
            maxPosition = positions[i];
          }
        }

        final List[] disjunctLists = new List[maxPosition + 1];
        int distinctPositions = 0;

        for (int i = 0; i < termArrays.size(); ++i) {
          final Term[] termArray = (Term[]) termArrays.get(i);
          List disjuncts = disjunctLists[positions[i]];

          if (disjuncts == null) {
            disjuncts = (disjunctLists[positions[i]] = new ArrayList(termArray.length));
            ++distinctPositions;
          }

          for (int j = 0; j < termArray.length; ++j) {
            disjuncts.add(new SpanTermQuery(termArray[j]));
          }
        }

        // Positions with no terms widen the allowed slop instead of adding a clause.
        int positionGaps = 0;
        int position = 0;
        final SpanQuery[] clauses = new SpanQuery[distinctPositions];

        for (int i = 0; i < disjunctLists.length; ++i) {
          List disjuncts = disjunctLists[i];

          if (disjuncts != null) {
            clauses[position++] = new SpanOrQuery((SpanQuery[]) disjuncts
                .toArray(new SpanQuery[disjuncts.size()]));
          } else {
            ++positionGaps;
          }
        }

        final int slop = mpq.getSlop();
        final boolean inorder = (slop == 0);

        SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
        sp.setBoost(query.getBoost());
        extractWeightedSpanTerms(terms, sp);
      }
    } else if (query instanceof ConstantScoreRangeQuery) {
      // Expand the range against the MemoryIndex of the highlighted text into
      // a BooleanQuery of the terms that actually fall inside the range.
      ConstantScoreRangeQuery q = (ConstantScoreRangeQuery) query;
      Term lower = new Term(fieldName, q.getLowerVal());
      Term upper = new Term(fieldName, q.getUpperVal());
      FilterIndexReader fir = new FilterIndexReader(getReaderForField(fieldName));

      try {
        TermEnum te = fir.terms(lower);
        BooleanQuery bq = new BooleanQuery();

        do {
          Term term = te.term();

          if (term != null && upper.compareTo(term) >= 0) {
            bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
          } else {
            break;
          }
        } while (te.next());

        extract(bq, terms);
      } finally {
        fir.close();
      }
    } else {
      // Unsupported query type - nothing to extract.
      // (Removed a leftover debug System.out.println from library code.)
    }
  }

  /**
   * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied <code>SpanQuery</code>.
   * Runs the SpanQuery against per-field MemoryIndexes of the highlighted
   * text and records the matching span positions on each term.
   *
   * @param terms
   *          Map to place created WeightedSpanTerms in
   * @param spanQuery
   *          SpanQuery to extract Terms from
   * @throws IOException
   */
  private void extractWeightedSpanTerms(Map terms, SpanQuery spanQuery) throws IOException {
    Set nonWeightedTerms = new HashSet();
    spanQuery.extractTerms(nonWeightedTerms);

    Set fieldNames;

    if (fieldName == null) {
      // No field restriction: collect every field referenced by the query.
      fieldNames = new HashSet();

      for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
        Term queryTerm = (Term) iter.next();
        fieldNames.add(queryTerm.field());
      }
    } else {
      fieldNames = new HashSet(1);
      fieldNames.add(fieldName);
    }

    // To support the use of the default field name
    if (defaultField != null) {
      fieldNames.add(defaultField);
    }

    Iterator it = fieldNames.iterator();
    List spanPositions = new ArrayList();

    while (it.hasNext()) {
      String field = (String) it.next();

      IndexReader reader = getReaderForField(field);
      Spans spans = spanQuery.getSpans(reader);

      // collect span positions (end is exclusive in Spans, inclusive here)
      while (spans.next()) {
        spanPositions.add(new PositionSpan(spans.start(), spans.end() - 1));
      }

      // Rewind the shared token stream so the next field/query can replay it.
      cachedTokenFilter.reset();
    }

    if (spanPositions.size() == 0) {
      // no spans found
      return;
    }

    for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
      Term queryTerm = (Term) iter.next();

      if (fieldNameComparator(queryTerm.field())) {
        WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(queryTerm.text());

        if (weightedSpanTerm == null) {
          weightedSpanTerm = new WeightedSpanTerm(spanQuery.getBoost(), queryTerm.text());
          weightedSpanTerm.addPositionSpans(spanPositions);
          weightedSpanTerm.positionSensitive = true;
          terms.put(queryTerm.text(), weightedSpanTerm);
        } else {
          if (spanPositions.size() > 0) {
            weightedSpanTerm.addPositionSpans(spanPositions);
            weightedSpanTerm.positionSensitive = true;
          }
        }
      }
    }
  }

  /**
   * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied <code>Query</code>.
   * Terms added here are not position sensitive.
   *
   * @param terms
   *          Map to place created WeightedSpanTerms in
   * @param query
   *          Query to extract Terms from
   * @throws IOException
   */
  private void extractWeightedTerms(Map terms, Query query) throws IOException {
    Set nonWeightedTerms = new HashSet();
    query.extractTerms(nonWeightedTerms);

    for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
      Term queryTerm = (Term) iter.next();

      if (fieldNameComparator(queryTerm.field())) {
        WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), queryTerm.text());
        terms.put(queryTerm.text(), weightedSpanTerm);
      }
    }
  }

  /**
   * Necessary to implement matches for queries against <code>defaultField</code>.
   * Relies on identity comparison of interned field name strings.
   */
  private boolean fieldNameComparator(String fieldNameToCheck) {
    boolean rv = fieldName == null || fieldNameToCheck == fieldName
        || fieldNameToCheck == defaultField;

    return rv;
  }

  /**
   * Lazily builds (and caches) a MemoryIndex-backed reader containing the
   * cached token stream indexed under <code>field</code>.
   */
  private IndexReader getReaderForField(String field) {
    IndexReader reader = (IndexReader) readers.get(field);

    if (reader == null) {
      MemoryIndex indexer = new MemoryIndex();
      indexer.addField(field, cachedTokenFilter);

      IndexSearcher searcher = indexer.createSearcher();
      reader = searcher.getIndexReader();
      readers.put(field, reader);
    }

    return reader;
  }

  /**
   * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
   *
   * <p>
   *
   * @param query
   *          that caused hit
   * @param cachingTokenFilter
   *          of text to be highlighted
   * @return Map of WeightedSpanTerms keyed on term text
   * @throws IOException
   */
  public Map getWeightedSpanTerms(Query query, CachingTokenFilter cachingTokenFilter)
    throws IOException {
    this.fieldName = null;
    this.cachedTokenFilter = cachingTokenFilter;

    Map terms = new HashMap();

    try {
      extract(query, terms);
    } finally {
      closeReaders();
    }

    return terms;
  }

  /**
   * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
   *
   * <p>
   *
   * @param query
   *          that caused hit
   * @param cachingTokenFilter
   *          of text to be highlighted
   * @param fieldName
   *          restricts Term's used based on field name
   * @return Map of WeightedSpanTerms keyed on term text
   * @throws IOException
   */
  public Map getWeightedSpanTerms(Query query, CachingTokenFilter cachingTokenFilter,
    String fieldName) throws IOException {
    if (fieldName != null) {
      this.fieldName = fieldName.intern();
    }

    Map terms = new HashMap();
    this.cachedTokenFilter = cachingTokenFilter;

    try {
      extract(query, terms);
    } finally {
      closeReaders();
    }

    return terms;
  }

  /**
   * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
   * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
   *
   * <p>
   *
   * @param query
   *          that caused hit
   * @param tokenStream
   *          of text to be highlighted
   * @param fieldName
   *          restricts Term's used based on field name
   * @param reader
   *          to use for scoring
   * @return Map of WeightedSpanTerms keyed on term text
   * @throws IOException
   */
  public Map getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName,
    IndexReader reader) throws IOException {
    this.fieldName = fieldName;
    this.cachedTokenFilter = new CachingTokenFilter(tokenStream);

    Map terms = new HashMap();

    try {
      // extract() must run inside the try so the MemoryIndex readers it
      // opens are closed even when it throws (previously it ran outside
      // the try/finally and could leak readers).
      extract(query, terms);

      int totalNumDocs = reader.numDocs();
      Set weightedTerms = terms.keySet();
      Iterator it = weightedTerms.iterator();

      while (it.hasNext()) {
        WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next());
        int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));

        // IDF algorithm taken from DefaultSimilarity class
        float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
        weightedSpanTerm.weight *= idf;
      }
    } finally {
      closeReaders();
    }

    return terms;
  }

  public boolean isHighlightCnstScrRngQuery() {
    return highlightCnstScrRngQuery;
  }

  public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) {
    this.highlightCnstScrRngQuery = highlightCnstScrRngQuery;
  }
}

View File

@ -0,0 +1,54 @@
<html>
<body>
<p>
The spanscorer classes provide the Highlighter with the ability
to only highlight the Tokens that contributed to a query match.
The SpanScorer class is the central component and it will attempt to score Terms
based on whether they actually participated in scoring the Query.
</p>
<p>
The implementation is very similar to QueryScorer in that WeightedSpanTerms are extracted
from the given Query and then placed in a Map. During Token scoring, Terms found in
the Map return a score equal to their weight. The added wrinkle is that when terms are
extracted, the sub-queries that make up the Query are converted to SpanQuery's and
SpanQuery.getSpans() is applied to a MemoryIndex containing the TokenStream of the text to
be highlighted if the sub-query is position sensitive. The start and end positions of the
matching Spans are recorded with the respective WeightedSpanTerms and these positions are
then used to filter possible Token matches during scoring.
</p>
<h2>Example Usage</h2>
<pre>
IndexSearcher searcher = new IndexSearcher(ramDir);
Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
query = query.rewrite(reader); //required to expand search terms
Hits hits = searcher.search(query);
for (int i = 0; i &lt; hits.length(); i++)
{
String text = hits.doc(i).get(FIELD_NAME);
CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
FIELD_NAME, new StringReader(text)));
Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream));
tokenStream.reset();
// Get 3 best fragments and separate with a "..."
String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
System.out.println(result);
}
</pre>
<p>
If you make a call to any of the getBestFragments() methods more than once, you must call reset() on the SpanScorer
between each call.
</p>
<p>The SpanScorer class has a constructor which can use an IndexReader to derive the IDF (inverse document frequency)
for each term in order to influence the score. This is useful for helping to extract the most significant sections
of a document and in supplying scores used by the GradientFormatter to color significant words more strongly.
The SpanScorer.getMaxTermWeight method is useful when passed to the GradientFormatter constructor to define the top score
which is associated with the top color.</p>
</body>
</html>