SOLR-386: configurable SolrHighlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@639490 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mike Klaas 2008-03-20 22:39:27 +00:00
parent 08a254c5d3
commit 52e60f089a
7 changed files with 944 additions and 437 deletions

View File

@ -558,6 +558,11 @@ New Features
of constructors that takes an ErrorCode enum. This will ensure that
all SolrExceptions use a valid HTTP status code. (ryan)
36. SOLR-386: Abstracted SolrHighlighter and moved existing implementation
to DefaultSolrHighlighter. Adjusted SolrCore and solrconfig.xml so
that highlighter is configurable via a class attribute. Allows users
to use their own highlighter implementation. (Tricia Williams via klaas)
Changes in runtime behavior
1. Highlighting using DisMax will only pick up terms from the main
user query, not boost or filter queries (klaas).

View File

@ -49,6 +49,7 @@ import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.handler.component.MoreLikeThisComponent;
import org.apache.solr.handler.component.QueryComponent;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.highlight.DefaultSolrHighlighter;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.JSONResponseWriter;
import org.apache.solr.request.PythonResponseWriter;
@ -306,6 +307,9 @@ public final class SolrCore {
return createInstance(className, UpdateHandler.class, "Update Handler");
}
private SolrHighlighter createHighlighter(String className) {
return createInstance(className, SolrHighlighter.class, "Highlighter");
}
/**
* @return the last core initialized. If you are using multiple cores,
@ -379,8 +383,9 @@ public final class SolrCore {
reqHandlers = new RequestHandlers(this);
reqHandlers.initHandlersFromConfig( solrConfig );
// TODO? could select the highlighter implementation
highlighter = new SolrHighlighter();
highlighter = createHighlighter(
solrConfig.get("highlighting/@class", DefaultSolrHighlighter.class.getName())
);
highlighter.initalize( solrConfig );
try {

View File

@ -0,0 +1,412 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import javax.xml.xpath.XPathConstants;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.Config;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.NamedListPluginLoader;
import org.w3c.dom.NodeList;
/**
*
* @since solr 1.3
*/
public class DefaultSolrHighlighter extends SolrHighlighter
{
public void initalize( final Config config )
{
formatters.clear();
fragmenters.clear();
// Load the fragmenters
String xpath = "highlighting/fragmenter";
NamedListPluginLoader<SolrFragmenter> fragloader = new NamedListPluginLoader<SolrFragmenter>( xpath, fragmenters );
SolrFragmenter frag = fragloader.load( config.getResourceLoader(), (NodeList)config.evaluate( xpath, XPathConstants.NODESET ) );
if( frag == null ) {
frag = new GapFragmenter();
}
fragmenters.put( "", frag );
fragmenters.put( null, frag );
// Load the formatters
xpath = "highlighting/formatter";
NamedListPluginLoader<SolrFormatter> fmtloader = new NamedListPluginLoader<SolrFormatter>( xpath, formatters );
SolrFormatter fmt = fmtloader.load( config.getResourceLoader(), (NodeList)config.evaluate( xpath, XPathConstants.NODESET ) );
if( fmt == null ) {
fmt = new HtmlFormatter();
}
formatters.put( "", fmt );
formatters.put( null, fmt );
}
/**
* Return a Highlighter appropriate for this field.
* @param query The current Query
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
*/
protected Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) {
SolrParams params = request.getParams();
Highlighter highlighter = new Highlighter(
getFormatter(fieldName, params),
getQueryScorer(query, fieldName, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
highlighter.setMaxDocBytesToAnalyze(params.getFieldInt(
fieldName, HighlightParams.MAX_CHARS,
Highlighter.DEFAULT_MAX_DOC_BYTES_TO_ANALYZE));
return highlighter;
}
/**
* Return a QueryScorer suitable for this Query and field.
* @param query The current query
* @param fieldName The name of the field
* @param request The SolrQueryRequest
*/
protected QueryScorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) {
boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
if (reqFieldMatch) {
return new QueryScorer(query, request.getSearcher().getReader(), fieldName);
}
else {
return new QueryScorer(query);
}
}
/**
* Return the max number of snippets for this field. If this has not
* been configured for this field, fall back to the configured default
* or the solr default.
* @param fieldName The name of the field
* @param params The params controlling Highlighting
*/
protected int getMaxSnippets(String fieldName, SolrParams params) {
return params.getFieldInt(fieldName, HighlightParams.SNIPPETS,1);
}
/**
* Return whether adjacent fragments should be merged.
* @param fieldName The name of the field
* @param params The params controlling Highlighting
*/
protected boolean isMergeContiguousFragments(String fieldName, SolrParams params){
return params.getFieldBool(fieldName, HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, false);
}
/**
* Return a formatter appropriate for this field. If a formatter
* has not been configured for this field, fall back to the configured
* default or the solr default (SimpleHTMLFormatter).
*
* @param fieldName The name of the field
* @param params The params controlling Highlighting
* @return An appropriate Formatter.
*/
protected Formatter getFormatter(String fieldName, SolrParams params )
{
String str = params.getFieldParam( fieldName, HighlightParams.FORMATTER );
SolrFormatter formatter = formatters.get( str );
if( formatter == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown formatter: "+str );
}
return formatter.getFormatter( fieldName, params );
}
/**
* Return a fragmenter appropriate for this field. If a fragmenter
* has not been configured for this field, fall back to the configured
* default or the solr default (GapFragmenter).
*
* @param fieldName The name of the field
* @param params The params controlling Highlighting
* @return An appropriate Fragmenter.
*/
protected Fragmenter getFragmenter(String fieldName, SolrParams params)
{
String fmt = params.getFieldParam( fieldName, HighlightParams.FRAGMENTER );
SolrFragmenter frag = fragmenters.get( fmt );
if( frag == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmenter: "+fmt );
}
return frag.getFragmenter( fieldName, params );
}
/**
* Generates a list of Highlighted query fragments for each item in a list
* of documents, or returns null if highlighting is disabled.
*
* @param docs query results
* @param query the query
* @param req the current request
* @param defaultFields default list of fields to summarize
*
* @return NamedList containing a NamedList for each document, which in
* turns contains sets (field, summary) pairs.
*/
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
SolrParams params = req.getParams();
if (!isHighlightingEnabled(params))
return null;
SolrIndexSearcher searcher = req.getSearcher();
IndexSchema schema = searcher.getSchema();
NamedList fragments = new SimpleOrderedMap();
String[] fieldNames = getHighlightFields(query, req, defaultFields);
Document[] readDocs = new Document[docs.size()];
{
// pre-fetch documents using the Searcher's doc cache
Set<String> fset = new HashSet<String>();
for(String f : fieldNames) { fset.add(f); }
// fetch unique key if one exists.
SchemaField keyField = schema.getUniqueKeyField();
if(null != keyField)
fset.add(keyField.getName());
searcher.readDocs(readDocs, docs, fset);
}
// Highlight each document
DocIterator iterator = docs.iterator();
for (int i = 0; i < docs.size(); i++) {
int docId = iterator.nextDoc();
Document doc = readDocs[i];
NamedList docSummaries = new SimpleOrderedMap();
for (String fieldName : fieldNames) {
fieldName = fieldName.trim();
String[] docTexts = doc.getValues(fieldName);
if (docTexts == null) continue;
// get highlighter, and number of fragments for this field
Highlighter highlighter = getHighlighter(query, fieldName, req);
int numFragments = getMaxSnippets(fieldName, params);
boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
String[] summaries = null;
TextFragment[] frag;
if (docTexts.length == 1) {
// single-valued field
TokenStream tstream;
try {
// attempt term vectors
tstream = TokenSources.getTokenStream(searcher.getReader(), docId, fieldName);
}
catch (IllegalArgumentException e) {
// fall back to analyzer
tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[0])), 10);
}
frag = highlighter.getBestTextFragments(tstream, docTexts[0], mergeContiguousFragments, numFragments);
}
else {
// multi-valued field
MultiValueTokenStream tstream;
tstream = new MultiValueTokenStream(fieldName, docTexts, schema.getAnalyzer(), true);
frag = highlighter.getBestTextFragments(tstream, tstream.asSingleValue(), false, numFragments);
}
// convert fragments back into text
// TODO: we can include score and position information in output as snippet attributes
if (frag.length > 0) {
ArrayList<String> fragTexts = new ArrayList<String>();
for (int j = 0; j < frag.length; j++) {
if ((frag[j] != null) && (frag[j].getScore() > 0)) {
fragTexts.add(frag[j].toString());
}
}
summaries = fragTexts.toArray(new String[0]);
if (summaries.length > 0)
docSummaries.add(fieldName, summaries);
}
// no summeries made, copy text from alternate field
if (summaries == null || summaries.length == 0) {
String alternateField = req.getParams().getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD);
if (alternateField != null && alternateField.length() > 0) {
String[] altTexts = doc.getValues(alternateField);
if (altTexts != null && altTexts.length > 0)
docSummaries.add(fieldName, altTexts);
}
}
}
String printId = schema.printableUniqueKey(doc);
fragments.add(printId == null ? null : printId, docSummaries);
}
return fragments;
}
}
/**
* Helper class which creates a single TokenStream out of values from a
* multi-valued field.
*/
class MultiValueTokenStream extends TokenStream {
private String fieldName;
private String[] values;
private Analyzer analyzer;
private int curIndex; // next index into the values array
private int curOffset; // offset into concatenated string
private TokenStream currentStream; // tokenStream currently being iterated
private boolean orderTokenOffsets;
/** Constructs a TokenStream for consecutively-analyzed field values
*
* @param fieldName name of the field
* @param values array of field data
* @param analyzer analyzer instance
*/
public MultiValueTokenStream(String fieldName, String[] values,
Analyzer analyzer, boolean orderTokenOffsets) {
this.fieldName = fieldName;
this.values = values;
this.analyzer = analyzer;
curIndex = -1;
curOffset = 0;
currentStream = null;
this.orderTokenOffsets=orderTokenOffsets;
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public Token next() throws IOException {
int extra = 0;
if(currentStream == null) {
curIndex++;
if(curIndex < values.length) {
currentStream = analyzer.tokenStream(fieldName,
new StringReader(values[curIndex]));
if (orderTokenOffsets) currentStream = new TokenOrderingFilter(currentStream,10);
// add extra space between multiple values
if(curIndex > 0)
extra = analyzer.getPositionIncrementGap(fieldName);
} else {
return null;
}
}
Token nextToken = currentStream.next();
if(nextToken == null) {
curOffset += values[curIndex].length();
currentStream = null;
return next();
}
// create an modified token which is the offset into the concatenated
// string of all values
Token offsetToken = new Token(nextToken.termText(),
nextToken.startOffset() + curOffset,
nextToken.endOffset() + curOffset);
offsetToken.setPositionIncrement(nextToken.getPositionIncrement() + extra*10);
return offsetToken;
}
/**
* Returns all values as a single String into which the Tokens index with
* their offsets.
*/
public String asSingleValue() {
StringBuilder sb = new StringBuilder();
for(String str : values)
sb.append(str);
return sb.toString();
}
}
/** Orders Tokens in a window first by their startOffset ascending.
* endOffset is currently ignored.
* This is meant to work around fickleness in the highlighter only. It
* can mess up token positions and should not be used for indexing or querying.
*/
class TokenOrderingFilter extends TokenFilter {
private final int windowSize;
private final LinkedList<Token> queue = new LinkedList<Token>();
private boolean done=false;
protected TokenOrderingFilter(TokenStream input, int windowSize) {
super(input);
this.windowSize = windowSize;
}
@Override
public Token next() throws IOException {
while (!done && queue.size() < windowSize) {
Token newTok = input.next();
if (newTok==null) {
done=true;
break;
}
// reverse iterating for better efficiency since we know the
// list is already sorted, and most token start offsets will be too.
ListIterator<Token> iter = queue.listIterator(queue.size());
while(iter.hasPrevious()) {
if (newTok.startOffset() >= iter.previous().startOffset()) {
// insertion will be before what next() would return (what
// we just compared against), so move back one so the insertion
// will be after.
iter.next();
break;
}
}
iter.add(newTok);
}
return queue.isEmpty() ? null : queue.removeFirst();
}
}

View File

@ -1,68 +1,26 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import javax.xml.xpath.XPathConstants;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.Config;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.NamedListPluginLoader;
import org.w3c.dom.NodeList;
/**
*
* @since solr 1.3
*/
public class SolrHighlighter
public abstract class SolrHighlighter
{
public static Logger log = Logger.getLogger(SolrHighlighter.class.getName());
@ -74,31 +32,7 @@ public class SolrHighlighter
protected final Map<String,SolrFragmenter> fragmenters =
Collections.synchronizedMap( new HashMap<String, SolrFragmenter>() );
public void initalize( final Config config )
{
formatters.clear();
fragmenters.clear();
// Load the fragmenters
String xpath = "highlighting/fragmenter";
NamedListPluginLoader<SolrFragmenter> fragloader = new NamedListPluginLoader<SolrFragmenter>( xpath, fragmenters );
SolrFragmenter frag = fragloader.load( config.getResourceLoader(), (NodeList)config.evaluate( xpath, XPathConstants.NODESET ) );
if( frag == null ) {
frag = new GapFragmenter();
}
fragmenters.put( "", frag );
fragmenters.put( null, frag );
// Load the formatters
xpath = "highlighting/formatter";
NamedListPluginLoader<SolrFormatter> fmtloader = new NamedListPluginLoader<SolrFormatter>( xpath, formatters );
SolrFormatter fmt = fmtloader.load( config.getResourceLoader(), (NodeList)config.evaluate( xpath, XPathConstants.NODESET ) );
if( fmt == null ) {
fmt = new HtmlFormatter();
}
formatters.put( "", fmt );
formatters.put( null, fmt );
}
public abstract void initalize( final Config config );
/**
@ -110,40 +44,6 @@ public class SolrHighlighter
return params.getBool(HighlightParams.HIGHLIGHT, false);
}
/**
* Return a Highlighter appropriate for this field.
* @param query The current Query
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
*/
protected Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) {
SolrParams params = request.getParams();
Highlighter highlighter = new Highlighter(
getFormatter(fieldName, params),
getQueryScorer(query, fieldName, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
highlighter.setMaxDocBytesToAnalyze(params.getFieldInt(
fieldName, HighlightParams.MAX_CHARS,
Highlighter.DEFAULT_MAX_DOC_BYTES_TO_ANALYZE));
return highlighter;
}
/**
* Return a QueryScorer suitable for this Query and field.
* @param query The current query
* @param fieldName The name of the field
* @param request The SolrQueryRequest
*/
protected QueryScorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) {
boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
if (reqFieldMatch) {
return new QueryScorer(query, request.getSearcher().getReader(), fieldName);
}
else {
return new QueryScorer(query);
}
}
/**
* Return a String array of the fields to be highlighted.
* Falls back to the programatic defaults, or the default search field if the list of fields
@ -159,7 +59,7 @@ public class SolrHighlighter
if(emptyArray(fields)) {
// use default search field if highlight fieldlist not specified.
if (emptyArray(defaultFields)) {
String defaultSearchField = request.getSchema().getDefaultSearchFieldName();
String defaultSearchField = request.getSchema().getSolrQueryParser(null).getField();
fields = null == defaultSearchField ? new String[]{} : new String[]{defaultSearchField};
}
else {
@ -178,64 +78,6 @@ public class SolrHighlighter
return (arr == null || arr.length == 0 || arr[0] == null || arr[0].trim().length() == 0);
}
/**
* Return the max number of snippets for this field. If this has not
* been configured for this field, fall back to the configured default
* or the solr default.
* @param fieldName The name of the field
* @param params The params controlling Highlighting
*/
protected int getMaxSnippets(String fieldName, SolrParams params) {
return params.getFieldInt(fieldName, HighlightParams.SNIPPETS,1);
}
/**
* Return whether adjacent fragments should be merged.
* @param fieldName The name of the field
* @param params The params controlling Highlighting
*/
protected boolean isMergeContiguousFragments(String fieldName, SolrParams params){
return params.getFieldBool(fieldName, HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, false);
}
/**
* Return a formatter appropriate for this field. If a formatter
* has not been configured for this field, fall back to the configured
* default or the solr default (SimpleHTMLFormatter).
*
* @param fieldName The name of the field
* @param params The params controlling Highlighting
* @return An appropriate Formatter.
*/
protected Formatter getFormatter(String fieldName, SolrParams params )
{
String str = params.getFieldParam( fieldName, HighlightParams.FORMATTER );
SolrFormatter formatter = formatters.get( str );
if( formatter == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown formatter: "+str );
}
return formatter.getFormatter( fieldName, params );
}
/**
* Return a fragmenter appropriate for this field. If a fragmenter
* has not been configured for this field, fall back to the configured
* default or the solr default (GapFragmenter).
*
* @param fieldName The name of the field
* @param params The params controlling Highlighting
* @return An appropriate Fragmenter.
*/
protected Fragmenter getFragmenter(String fieldName, SolrParams params)
{
String fmt = params.getFieldParam( fieldName, HighlightParams.FRAGMENTER );
SolrFragmenter frag = fragmenters.get( fmt );
if( frag == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmenter: "+fmt );
}
return frag.getFragmenter( fieldName, params );
}
/**
* Generates a list of Highlighted query fragments for each item in a list
* of documents, or returns null if highlighting is disabled.
@ -249,216 +91,5 @@ public class SolrHighlighter
* turns contains sets (field, summary) pairs.
*/
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
SolrParams params = req.getParams();
if (!isHighlightingEnabled(params))
return null;
SolrIndexSearcher searcher = req.getSearcher();
IndexSchema schema = searcher.getSchema();
NamedList fragments = new SimpleOrderedMap();
String[] fieldNames = getHighlightFields(query, req, defaultFields);
Document[] readDocs = new Document[docs.size()];
{
// pre-fetch documents using the Searcher's doc cache
Set<String> fset = new HashSet<String>();
for(String f : fieldNames) { fset.add(f); }
// fetch unique key if one exists.
SchemaField keyField = schema.getUniqueKeyField();
if(null != keyField)
fset.add(keyField.getName());
searcher.readDocs(readDocs, docs, fset);
public abstract NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException;
}
// Highlight each document
DocIterator iterator = docs.iterator();
for (int i = 0; i < docs.size(); i++) {
int docId = iterator.nextDoc();
Document doc = readDocs[i];
NamedList docSummaries = new SimpleOrderedMap();
for (String fieldName : fieldNames) {
fieldName = fieldName.trim();
String[] docTexts = doc.getValues(fieldName);
if (docTexts == null) continue;
// get highlighter, and number of fragments for this field
Highlighter highlighter = getHighlighter(query, fieldName, req);
int numFragments = getMaxSnippets(fieldName, params);
boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
String[] summaries = null;
TextFragment[] frag;
if (docTexts.length == 1) {
// single-valued field
TokenStream tstream;
try {
// attempt term vectors
tstream = TokenSources.getTokenStream(searcher.getReader(), docId, fieldName);
}
catch (IllegalArgumentException e) {
// fall back to analyzer
tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[0])), 10);
}
frag = highlighter.getBestTextFragments(tstream, docTexts[0], mergeContiguousFragments, numFragments);
}
else {
// multi-valued field
MultiValueTokenStream tstream;
tstream = new MultiValueTokenStream(fieldName, docTexts, schema.getAnalyzer(), true);
frag = highlighter.getBestTextFragments(tstream, tstream.asSingleValue(), false, numFragments);
}
// convert fragments back into text
// TODO: we can include score and position information in output as snippet attributes
if (frag.length > 0) {
ArrayList<String> fragTexts = new ArrayList<String>();
for (int j = 0; j < frag.length; j++) {
if ((frag[j] != null) && (frag[j].getScore() > 0)) {
fragTexts.add(frag[j].toString());
}
}
summaries = fragTexts.toArray(new String[0]);
if (summaries.length > 0)
docSummaries.add(fieldName, summaries);
}
// no summeries made, copy text from alternate field
if (summaries == null || summaries.length == 0) {
String alternateField = req.getParams().getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD);
if (alternateField != null && alternateField.length() > 0) {
String[] altTexts = doc.getValues(alternateField);
if (altTexts != null && altTexts.length > 0)
docSummaries.add(fieldName, altTexts);
}
}
}
String printId = schema.printableUniqueKey(doc);
fragments.add(printId == null ? null : printId, docSummaries);
}
return fragments;
}
}
/**
* Helper class which creates a single TokenStream out of values from a
* multi-valued field.
*/
class MultiValueTokenStream extends TokenStream {
private String fieldName;
private String[] values;
private Analyzer analyzer;
private int curIndex; // next index into the values array
private int curOffset; // offset into concatenated string
private TokenStream currentStream; // tokenStream currently being iterated
private boolean orderTokenOffsets;
/** Constructs a TokenStream for consecutively-analyzed field values
*
* @param fieldName name of the field
* @param values array of field data
* @param analyzer analyzer instance
*/
public MultiValueTokenStream(String fieldName, String[] values,
Analyzer analyzer, boolean orderTokenOffsets) {
this.fieldName = fieldName;
this.values = values;
this.analyzer = analyzer;
curIndex = -1;
curOffset = 0;
currentStream = null;
this.orderTokenOffsets=orderTokenOffsets;
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public Token next() throws IOException {
int extra = 0;
if(currentStream == null) {
curIndex++;
if(curIndex < values.length) {
currentStream = analyzer.tokenStream(fieldName,
new StringReader(values[curIndex]));
if (orderTokenOffsets) currentStream = new TokenOrderingFilter(currentStream,10);
// add extra space between multiple values
if(curIndex > 0)
extra = analyzer.getPositionIncrementGap(fieldName);
} else {
return null;
}
}
Token nextToken = currentStream.next();
if(nextToken == null) {
curOffset += values[curIndex].length();
currentStream = null;
return next();
}
// create an modified token which is the offset into the concatenated
// string of all values
Token offsetToken = new Token(nextToken.termText(),
nextToken.startOffset() + curOffset,
nextToken.endOffset() + curOffset);
offsetToken.setPositionIncrement(nextToken.getPositionIncrement() + extra*10);
return offsetToken;
}
/**
* Returns all values as a single String into which the Tokens index with
* their offsets.
*/
public String asSingleValue() {
StringBuilder sb = new StringBuilder();
for(String str : values)
sb.append(str);
return sb.toString();
}
}
/** Orders Tokens in a window first by their startOffset ascending.
* endOffset is currently ignored.
* This is meant to work around fickleness in the highlighter only. It
* can mess up token positions and should not be used for indexing or querying.
*/
class TokenOrderingFilter extends TokenFilter {
private final int windowSize;
private final LinkedList<Token> queue = new LinkedList<Token>();
private boolean done=false;
protected TokenOrderingFilter(TokenStream input, int windowSize) {
super(input);
this.windowSize = windowSize;
}
@Override
public Token next() throws IOException {
while (!done && queue.size() < windowSize) {
Token newTok = input.next();
if (newTok==null) {
done=true;
break;
}
// reverse iterating for better efficiency since we know the
// list is already sorted, and most token start offsets will be too.
ListIterator<Token> iter = queue.listIterator(queue.size());
while(iter.hasPrevious()) {
if (newTok.startOffset() >= iter.previous().startOffset()) {
// insertion will be before what next() would return (what
// we just compared against), so move back one so the insertion
// will be after.
iter.next();
break;
}
}
iter.add(newTok);
}
return queue.isEmpty() ? null : queue.removeFirst();
}
}

View File

@ -0,0 +1,27 @@
package org.apache.solr.highlight;
import java.io.IOException;
import org.apache.lucene.search.Query;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.Config;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.DocList;
public class DummyHighlighter extends SolrHighlighter {
@Override
public NamedList<Object> doHighlighting(DocList docs, Query query,
SolrQueryRequest req, String[] defaultFields) throws IOException {
NamedList fragments = new SimpleOrderedMap();
fragments.add("dummy", "thing1");
return fragments;
}
@Override
public void initalize(Config config) {
// do nothing
}
}

View File

@ -0,0 +1,61 @@
package org.apache.solr.highlight;
import java.io.IOException;
import java.util.HashMap;
import org.apache.lucene.search.Query;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.Config;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.DocList;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
public class HighlighterConfigTest extends AbstractSolrTestCase {
@Override public String getSchemaFile() { return "schema.xml"; }
// the default case (i.e. <highlight> without a class attribute) is tested every time sorlconfig.xml is used
@Override public String getSolrConfigFile() { return "solrconfig-highlight.xml"; }
@Override
public void setUp() throws Exception {
// if you override setUp or tearDown, you better call
// the super classes version
super.setUp();
}
@Override
public void tearDown() throws Exception {
// if you override setUp or tearDown, you better call
// the super classes version
super.tearDown();
}
public void testConfig()
{
SolrHighlighter highlighter = SolrCore.getSolrCore().getHighlighter();
System.out.println( "highlighter" );
assertTrue( highlighter instanceof DummyHighlighter );
// check to see that doHighlight is called from the DummyHighlighter
HashMap<String,String> args = new HashMap<String,String>();
args.put("hl", "true");
args.put("df", "t_text");
args.put("hl.fl", "");
TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
assertU(adoc("t_text", "a long day's night", "id", "1"));
assertU(commit());
assertU(optimize());
assertQ("Basic summarization",
sumLRF.makeRequest("long"),
"//lst[@name='highlighting']/str[@name='dummy']"
);
}
}

View File

@ -0,0 +1,366 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- $Id: solrconfig.xml 382610 2006-03-03 01:43:03Z yonik $
$Source$
$Name$
-->
<config>
<!-- Used to specify an alternate directory to hold all index data.
It defaults to "index" if not present, and should probably
not be changed if replication is in use. -->
<dataDir>${solr.data.dir:./solr/data}</dataDir>
<indexDefaults>
<!-- Values here affect all index writers and act as a default
unless overridden. -->
<!-- Values here affect all index writers and act as a default unless overridden. -->
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<!-- If both ramBufferSizeMB and maxBufferedDocs is set, then Lucene will flush based on whichever limit is hit first.
-->
<!--<maxBufferedDocs>1000</maxBufferedDocs>-->
<!-- Tell Lucene when to flush documents to disk.
Giving Lucene more memory for indexing means faster indexing at the cost of more RAM
If both ramBufferSizeMB and maxBufferedDocs is set, then Lucene will flush based on whichever limit is hit first.
-->
<ramBufferSizeMB>32</ramBufferSizeMB>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<writeLockTimeout>1000</writeLockTimeout>
<commitLockTimeout>10000</commitLockTimeout>
<!--
Expert: Turn on Lucene's auto commit capability.
NOTE: Despite the name, this value does not have any relation to Solr's autoCommit functionality
-->
<luceneAutoCommit>false</luceneAutoCommit>
<!--
Expert:
The Merge Policy in Lucene controls how merging is handled by Lucene. The default in 2.3 is the LogByteSizeMergePolicy, previous
versions used LogDocMergePolicy.
LogByteSizeMergePolicy chooses segments to merge based on their size. The Lucene 2.2 default, LogDocMergePolicy chose when
to merge based on number of documents
Other implementations of MergePolicy must have a no-argument constructor
-->
<mergePolicy>org.apache.lucene.index.LogByteSizeMergePolicy</mergePolicy>
<!--
Expert:
The Merge Scheduler in Lucene controls how merges are performed. The ConcurrentMergeScheduler (Lucene 2.3 default)
can perform merges in the background using separate threads. The SerialMergeScheduler (Lucene 2.2 default) does not.
-->
<mergeScheduler>org.apache.lucene.index.ConcurrentMergeScheduler</mergeScheduler>
<!-- these are global... can't currently override per index -->
<writeLockTimeout>1000</writeLockTimeout>
<commitLockTimeout>10000</commitLockTimeout>
<lockType>single</lockType>
</indexDefaults>
<mainIndex>
<!-- lucene options specific to the main on-disk lucene index -->
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<ramBufferSizeMB>32</ramBufferSizeMB>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<unlockOnStartup>true</unlockOnStartup>
</mainIndex>
<updateHandler class="solr.DirectUpdateHandler2">
<!-- autocommit pending docs if certain criteria are met
<autoCommit>
<maxDocs>10000</maxDocs>
<maxTime>3600000</maxTime>
</autoCommit>
-->
<!-- represents a lower bound on the frequency that commits may
occur (in seconds). NOTE: not yet implemented
<commitIntervalLowerBound>0</commitIntervalLowerBound>
-->
<!-- The RunExecutableListener executes an external command.
exe - the name of the executable to run
dir - dir to use as the current working directory. default="."
wait - the calling thread waits until the executable returns. default="true"
args - the arguments to pass to the program. default=nothing
env - environment variables to set. default=nothing
-->
<!-- A postCommit event is fired after every commit
<listener event="postCommit" class="solr.RunExecutableListener">
<str name="exe">/var/opt/resin3/__PORT__/scripts/solr/snapshooter</str>
<str name="dir">/var/opt/resin3/__PORT__</str>
<bool name="wait">true</bool>
<arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
<arr name="env"> <str>MYVAR=val1</str> </arr>
</listener>
-->
</updateHandler>
<query>
<!-- Maximum number of clauses in a boolean query... can affect
range or wildcard queries that expand to big boolean
queries. An exception is thrown if exceeded.
-->
<maxBooleanClauses>1024</maxBooleanClauses>
<!-- Cache specification for Filters or DocSets - unordered set of *all* documents
that match a particular query.
-->
<filterCache
class="solr.search.LRUCache"
size="512"
initialSize="512"
autowarmCount="256"/>
<queryResultCache
class="solr.search.LRUCache"
size="512"
initialSize="512"
autowarmCount="1024"/>
<documentCache
class="solr.search.LRUCache"
size="512"
initialSize="512"
autowarmCount="0"/>
<!-- If true, stored fields that are not requested will be loaded lazily.
-->
<enableLazyFieldLoading>true</enableLazyFieldLoading>
<!--
<cache name="myUserCache"
class="solr.search.LRUCache"
size="4096"
initialSize="1024"
autowarmCount="1024"
regenerator="MyRegenerator"
/>
-->
<useFilterForSortedQuery>true</useFilterForSortedQuery>
<queryResultWindowSize>10</queryResultWindowSize>
<!-- set maxSize artificially low to exercise both types of sets -->
<HashDocSet maxSize="3" loadFactor="0.75"/>
<!-- boolToFilterOptimizer converts boolean clauses with zero boost
into cached filters if the number of docs selected by the clause exceeds
the threshold (represented as a fraction of the total index)
-->
<boolTofilterOptimizer enabled="false" cacheSize="32" threshold=".05"/>
<!-- a newSearcher event is fired whenever a new searcher is being prepared
and there is a current searcher handling requests (aka registered). -->
<!-- QuerySenderListener takes an array of NamedList and executes a
local query request for each NamedList in sequence. -->
<!--
<listener event="newSearcher" class="solr.QuerySenderListener">
<arr name="queries">
<lst> <str name="q">solr</str> <str name="start">0</str> <str name="rows">10</str> </lst>
<lst> <str name="q">rocks</str> <str name="start">0</str> <str name="rows">10</str> </lst>
</arr>
</listener>
-->
<!-- a firstSearcher event is fired whenever a new searcher is being
prepared but there is no current registered searcher to handle
requests or to gain prewarming data from. -->
<!--
<listener event="firstSearcher" class="solr.QuerySenderListener">
<arr name="queries">
<lst> <str name="q">fast_warm</str> <str name="start">0</str> <str name="rows">10</str> </lst>
</arr>
</listener>
-->
</query>
<!-- An alternate set representation that uses an integer hash to store filters (sets of docids).
If the set cardinality <= maxSize elements, then HashDocSet will be used instead of the bitset
based HashBitset. -->
<!-- requestHandler plugins... incoming queries will be dispatched to the
correct handler based on the qt (query type) param matching the
name of registered handlers.
The "standard" request handler is the default and will be used if qt
is not specified in the request.
-->
<requestHandler name="standard" class="solr.StandardRequestHandler"/>
<requestHandler name="dismaxOldStyleDefaults"
class="solr.DisMaxRequestHandler" >
<!-- for historic reasons, DisMaxRequestHandler will use all of
it's init params as "defaults" if there is no "defaults" list
specified
-->
<float name="tie">0.01</float>
<str name="qf">
text^0.5 features_t^1.0 subject^1.4 title_stemmed^2.0
</str>
<str name="pf">
text^0.2 features_t^1.1 subject^1.4 title_stemmed^2.0 title^1.5
</str>
<str name="bf">
ord(weight)^0.5 recip(rord(iind),1,1000,1000)^0.3
</str>
<str name="mm">
3&lt;-1 5&lt;-2 6&lt;90%
</str>
<int name="ps">100</int>
</requestHandler>
<requestHandler name="dismax" class="solr.DisMaxRequestHandler" >
<lst name="defaults">
<str name="q.alt">*:*</str>
<float name="tie">0.01</float>
<str name="qf">
text^0.5 features_t^1.0 subject^1.4 title_stemmed^2.0
</str>
<str name="pf">
text^0.2 features_t^1.1 subject^1.4 title_stemmed^2.0 title^1.5
</str>
<str name="bf">
ord(weight)^0.5 recip(rord(iind),1,1000,1000)^0.3
</str>
<str name="mm">
3&lt;-1 5&lt;-2 6&lt;90%
</str>
<int name="ps">100</int>
</lst>
</requestHandler>
<requestHandler name="old" class="solr.tst.OldRequestHandler" >
<int name="myparam">1000</int>
<float name="ratio">1.4142135</float>
<arr name="myarr"><int>1</int><int>2</int></arr>
<str>foo</str>
</requestHandler>
<requestHandler name="oldagain" class="solr.tst.OldRequestHandler" >
<lst name="lst1"> <str name="op">sqrt</str> <int name="val">2</int> </lst>
<lst name="lst2"> <str name="op">log</str> <float name="val">10</float> </lst>
</requestHandler>
<requestHandler name="test" class="solr.tst.TestRequestHandler" />
<!-- test query parameter defaults -->
<requestHandler name="defaults" class="solr.StandardRequestHandler">
<lst name="defaults">
<int name="rows">4</int>
<bool name="hl">true</bool>
<str name="hl.fl">text,name,subject,title,whitetok</str>
</lst>
</requestHandler>
<!-- test query parameter defaults -->
<requestHandler name="lazy" class="solr.StandardRequestHandler" startup="lazy">
<lst name="defaults">
<int name="rows">4</int>
<bool name="hl">true</bool>
<str name="hl.fl">text,name,subject,title,whitetok</str>
</lst>
</requestHandler>
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />
<requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />
<!-- test elevation -->
<searchComponent name="elevate" class="org.apache.solr.handler.component.QueryElevationComponent" >
<str name="queryFieldType">string</str>
<str name="config-file">elevate.xml</str>
</searchComponent>
<requestHandler name="/elevate" class="org.apache.solr.handler.component.SearchHandler">
<lst name="defaults">
<str name="echoParams">explicit</str>
</lst>
<arr name="last-components">
<str>elevate</str>
</arr>
</requestHandler>
<highlighting class="org.apache.solr.highlight.DummyHighlighter">
<!-- Configure the standard fragmenter -->
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
<lst name="defaults">
<int name="hl.fragsize">100</int>
</lst>
</fragmenter>
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
<lst name="defaults">
<int name="hl.fragsize">70</int>
</lst>
</fragmenter>
<!-- Configure the standard formatter -->
<formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
<lst name="defaults">
<str name="hl.simple.pre"><![CDATA[<em>]]></str>
<str name="hl.simple.post"><![CDATA[</em>]]></str>
</lst>
</formatter>
</highlighting>
<!-- enable streaming for testing... -->
<requestDispatcher handleSelect="true" >
<requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048" />
<httpCaching lastModifiedFrom="openTime" etagSeed="Solr" never304="false">
<cacheControl>max-age=30, public</cacheControl>
</httpCaching>
</requestDispatcher>
<admin>
<defaultQuery>solr</defaultQuery>
<gettableFiles>solrconfig.xml scheam.xml admin-extra.html</gettableFiles>
</admin>
<!-- test getting system property -->
<propTest attr1="${solr.test.sys.prop1}-$${literal}"
attr2="${non.existent.sys.prop:default-from-config}">prefix-${solr.test.sys.prop2}-suffix</propTest>
<queryParser name="foo" class="FooQParserPlugin"/>
</config>