SOLR-225 -- adding pluggable highlighting formatters and fragmenters

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@552682 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2007-07-03 06:14:07 +00:00
parent 95b41b373f
commit 1e037fcf50
11 changed files with 857 additions and 11 deletions

View File

@ -87,6 +87,10 @@ New Features
RequestHandlers, FieldTypes, and QueryResponseWriters to share the same RequestHandlers, FieldTypes, and QueryResponseWriters to share the same
base code for loading and initalizing plugins. (ryan) base code for loading and initalizing plugins. (ryan)
14. SOLR-225: Enable pluggable highlighting classes. Allow configurable
highlighting formatters and Fragmenters. (ryan)
Changes in runtime behavior Changes in runtime behavior
Optimizations Optimizations

View File

@ -431,6 +431,31 @@
</lst> </lst>
</requestHandler> </requestHandler>
<!-- This should mostlikely be commented out in the "default" case -->
<highlighting>
<!-- Configure the standard fragmenter -->
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
<lst name="defaults">
<int name="hl.fragsize">100</int>
</lst>
</fragmenter>
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
<lst name="defaults">
<int name="hl.fragsize">70</int>
</lst>
</fragmenter>
<!-- Configure the standard formatter -->
<formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
<lst name="defaults">
<str name="hl.simple.pre"><![CDATA[<em>]]></str>
<str name="hl.simple.post"><![CDATA[</em>]]></str>
</lst>
</formatter>
</highlighting>
<!-- queryResponseWriter plugins... query responses will be written using the <!-- queryResponseWriter plugins... query responses will be written using the
writer specified by the 'wt' request parameter matching the name of a registered writer specified by the 'wt' request parameter matching the name of a registered
writer. writer.

View File

@ -27,14 +27,17 @@ public interface HighlightParams {
public static final String SIMPLE = "simple"; public static final String SIMPLE = "simple";
public static final String HIGHLIGHT = "hl"; public static final String HIGHLIGHT = "hl";
public static final String PREFIX = "hl."; public static final String PREFIX = "hl.";
public static final String FIELDS = PREFIX+"fl"; public static final String FIELDS = PREFIX+"fl";
public static final String SNIPPETS = PREFIX+"snippets"; public static final String SNIPPETS = PREFIX+"snippets";
public static final String FRAGSIZE = PREFIX+"fragsize"; public static final String FRAGSIZE = PREFIX+"fragsize";
public static final String FORMATTER = PREFIX+"formatter"; public static final String INCREMENT = PREFIX+"increment";
public static final String SIMPLE_PRE = PREFIX+SIMPLE+".pre"; public static final String SLOP = PREFIX+"slop";
public static final String MAX_CHARS = PREFIX+"maxAnalyzedChars";
public static final String FORMATTER = PREFIX+"formatter";
public static final String FRAGMENTER = PREFIX+"fragmenter";
public static final String SIMPLE_PRE = PREFIX+SIMPLE+".pre";
public static final String SIMPLE_POST = PREFIX+SIMPLE+".post"; public static final String SIMPLE_POST = PREFIX+SIMPLE+".post";
public static final String FIELD_MATCH = PREFIX+"requireFieldMatch"; public static final String FIELD_MATCH = PREFIX+"requireFieldMatch";
} }

View File

@ -0,0 +1,106 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.solr.common.params.DefaultSolrParams;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
public class GapFragmenter extends HighlightingPluginBase implements SolrFragmenter
{
public Fragmenter getFragmenter(String fieldName, SolrParams params )
{
numRequests++;
if( defaults != null ) {
params = new DefaultSolrParams( params, defaults );
}
int fragsize = params.getFieldInt( fieldName, HighlightParams.FRAGSIZE, 100 );
return (fragsize <= 0) ? new NullFragmenter() : new LuceneGapFragmenter(fragsize);
}
///////////////////////////////////////////////////////////////////////
//////////////////////// SolrInfoMBeans methods ///////////////////////
///////////////////////////////////////////////////////////////////////
@Override
public String getDescription() {
return "GapFragmenter";
}
@Override
public String getVersion() {
return "$Revision$";
}
@Override
public String getSourceId() {
return "$Id$";
}
@Override
public String getSource() {
return "$URL$";
}
}
/**
* A simple modification of SimpleFragmenter which additionally creates new
* fragments when an unusually-large position increment is encountered
* (this behaves much better in the presence of multi-valued fields).
*/
class LuceneGapFragmenter extends SimpleFragmenter {
/**
* When a gap in term positions is observed that is at least this big, treat
* the gap as a fragment delimiter.
*/
public static final int INCREMENT_THRESHOLD = 50;
protected int fragOffsetAccum = 0;
public LuceneGapFragmenter() {
}
public LuceneGapFragmenter(int fragsize) {
super(fragsize);
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String)
*/
public void start(String originalText) {
fragOffsetAccum = 0;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
*/
public boolean isNewFragment(Token token) {
boolean isNewFrag =
token.endOffset() >= fragOffsetAccum + getFragmentSize() ||
token.getPositionIncrement() > INCREMENT_THRESHOLD;
if(isNewFrag) {
fragOffsetAccum += token.endOffset() - fragOffsetAccum;
}
return isNewFrag;
}
}

View File

@ -0,0 +1,73 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import java.net.URL;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrInfoMBean;
/**
*
* @author ryan
* @since solr 1.3
*/
public abstract class HighlightingPluginBase implements SolrInfoMBean
{
protected long numRequests;
protected SolrParams defaults;
public void init(NamedList args) {
if( args != null ) {
Object o = args.get("defaults");
if (o != null && o instanceof NamedList ) {
defaults = SolrParams.toSolrParams((NamedList)o);
}
}
}
//////////////////////// SolrInfoMBeans methods //////////////////////
public String getName() {
return this.getClass().getName();
}
public abstract String getDescription();
public abstract String getSourceId();
public abstract String getSource();
public abstract String getVersion();
public Category getCategory()
{
return Category.HIGHLIGHTING;
}
public URL[] getDocs() {
return null; // this can be overridden, but not required
}
public NamedList getStatistics() {
NamedList<Long> lst = new SimpleOrderedMap<Long>();
lst.add("requests", numRequests);
return lst;
}
}

View File

@ -0,0 +1,65 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.solr.common.params.DefaultSolrParams;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
/**
* Use the SimpleHTMLFormatter
*/
public class HtmlFormatter extends HighlightingPluginBase implements SolrFormatter
{
public Formatter getFormatter(String fieldName, SolrParams params )
{
numRequests++;
if( defaults != null ) {
params = new DefaultSolrParams( params, defaults );
}
return new SimpleHTMLFormatter(
params.getFieldParam(fieldName, HighlightParams.SIMPLE_PRE, "<em>" ),
params.getFieldParam(fieldName, HighlightParams.SIMPLE_POST, "</em>"));
}
///////////////////////////////////////////////////////////////////////
//////////////////////// SolrInfoMBeans methods ///////////////////////
///////////////////////////////////////////////////////////////////////
@Override
public String getDescription() {
return "GapFragmenter";
}
@Override
public String getVersion() {
return "$Revision$";
}
@Override
public String getSourceId() {
return "$Id$";
}
@Override
public String getSource() {
return "$URL$";
}
}

View File

@ -0,0 +1,44 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrInfoMBean;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
public interface SolrFormatter extends SolrInfoMBean, NamedListInitializedPlugin {
/** <code>init</code> will be called just once, immediately after creation.
* <p>The args are user-level initialization parameters that
* may be specified when declaring a request handler in
* solrconfig.xml
*/
public void init(NamedList args);
/**
* Return a formatter appropriate for this field.
*
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
* @return An appropriate Formatter.
*/
public Formatter getFormatter(String fieldName, SolrParams params );
}

View File

@ -0,0 +1,44 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrInfoMBean;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
public interface SolrFragmenter extends SolrInfoMBean, NamedListInitializedPlugin {
/** <code>init</code> will be called just once, immediately after creation.
* <p>The args are user-level initialization parameters that
* may be specified when declaring a request handler in
* solrconfig.xml
*/
public void init(NamedList args);
/**
* Return a fragmenter appropriate for this field.
*
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
* @return An appropriate Fragmenter.
*/
public Fragmenter getFragmenter(String fieldName, SolrParams params);
}

View File

@ -0,0 +1,435 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import javax.xml.xpath.XPathConstants;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.Config;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.NamedListPluginLoader;
import org.w3c.dom.NodeList;
public class SolrHighlighter
{
public static Logger log = Logger.getLogger(SolrHighlighter.class.getName());
// Thread safe registry
protected final Map<String,SolrFormatter> formatters =
Collections.synchronizedMap( new HashMap<String, SolrFormatter>() );
// Thread safe registry
protected final Map<String,SolrFragmenter> fragmenters =
Collections.synchronizedMap( new HashMap<String, SolrFragmenter>() );
public void initalize( Config config )
{
formatters.clear();
fragmenters.clear();
// Load the fragmenters
String xpath = "highlighting/fragmenter";
NamedListPluginLoader<SolrFragmenter> fragloader = new NamedListPluginLoader<SolrFragmenter>( xpath, fragmenters );
SolrFragmenter frag = fragloader.load( (NodeList)config.evaluate( xpath, XPathConstants.NODESET ) );
if( frag == null ) {
frag = new GapFragmenter();
}
fragmenters.put( "", frag );
fragmenters.put( null, frag );
// Load the formatters
xpath = "highlighting/formatter";
NamedListPluginLoader<SolrFormatter> fmtloader = new NamedListPluginLoader<SolrFormatter>( xpath, formatters );
SolrFormatter fmt = fmtloader.load( (NodeList)config.evaluate( xpath, XPathConstants.NODESET ) );
if( fmt == null ) {
fmt = new HtmlFormatter();
}
formatters.put( "", fmt );
formatters.put( null, fmt );
}
/**
* Check whether Highlighting is enabled for this request.
* @param request The current SolrQueryRequest
* @return <code>true</code> if highlighting enabled, <code>false</code> if not.
*/
public boolean isHighlightingEnabled(SolrParams params) {
return params.getBool(HighlightParams.HIGHLIGHT, false);
}
/**
* Return a Highlighter appropriate for this field.
* @param query The current Query
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
*/
protected Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) {
SolrParams params = request.getParams();
Highlighter highlighter = new Highlighter(
getFormatter(fieldName, params),
getQueryScorer(query, fieldName, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
return highlighter;
}
/**
* Return a QueryScorer suitable for this Query and field.
* @param query The current query
* @param fieldName The name of the field
* @param request The SolrQueryRequest
*/
protected QueryScorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) {
boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
if (reqFieldMatch) {
return new QueryScorer(query, request.getSearcher().getReader(), fieldName);
}
else {
return new QueryScorer(query);
}
}
/**
* Return a String array of the fields to be highlighted.
* Falls back to the programatic defaults, or the default search field if the list of fields
* is not specified in either the handler configuration or the request.
* @param query The current Query
* @param request The current SolrQueryRequest
* @param defaultFields Programmatic default highlight fields, used if nothing is specified in the handler config or the request.
*/
public String[] getHighlightFields(Query query, SolrQueryRequest request, String[] defaultFields) {
String fields[] = request.getParams().getParams(HighlightParams.FIELDS);
// if no fields specified in the request, or the handler, fall back to programmatic default, or default search field.
if(emptyArray(fields)) {
// use default search field if highlight fieldlist not specified.
if (emptyArray(defaultFields)) {
fields = new String[]{request.getSchema().getDefaultSearchFieldName()};
}
else {
fields = defaultFields;
}
}
else if (fields.length == 1) {
// if there's a single request/handler value, it may be a space/comma separated list
fields = SolrPluginUtils.split(fields[0]);
}
return fields;
}
protected boolean emptyArray(String[] arr) {
return (arr == null || arr.length == 0 || arr[0] == null || arr[0].trim().length() == 0);
}
/**
* Return the max number of snippets for this field. If this has not
* been configured for this field, fall back to the configured default
* or the solr default.
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
*/
protected int getMaxSnippets(String fieldName, SolrParams params) {
return params.getFieldInt(fieldName, HighlightParams.SNIPPETS,1);
}
/**
* Return a formatter appropriate for this field. If a formatter
* has not been configured for this field, fall back to the configured
* default or the solr default (SimpleHTMLFormatter).
*
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
* @return An appropriate Formatter.
*/
protected Formatter getFormatter(String fieldName, SolrParams params )
{
String str = params.getFieldParam( fieldName, HighlightParams.FORMATTER );
SolrFormatter formatter = formatters.get( str );
if( formatter == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown formatter: "+str );
}
return formatter.getFormatter( fieldName, params );
}
/**
* Return a fragmenter appropriate for this field. If a fragmenter
* has not been configured for this field, fall back to the configured
* default or the solr default (GapFragmenter).
*
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
* @return An appropriate Fragmenter.
*/
protected Fragmenter getFragmenter(String fieldName, SolrParams params)
{
String fmt = params.getFieldParam( fieldName, HighlightParams.FRAGMENTER );
SolrFragmenter frag = fragmenters.get( fmt );
if( frag == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmenter: "+fmt );
}
return frag.getFragmenter( fieldName, params );
}
/**
* Generates a list of Highlighted query fragments for each item in a list
* of documents, or returns null if highlighting is disabled.
*
* @param docs query results
* @param query the query
* @param req the current request
* @param defaultFields default list of fields to summarize
*
* @return NamedList containing a NamedList for each document, which in
* turns contains sets (field, summary) pairs.
*/
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
SolrParams params = req.getParams();
if (!isHighlightingEnabled(params))
return null;
SolrIndexSearcher searcher = req.getSearcher();
IndexSchema schema = searcher.getSchema();
NamedList fragments = new SimpleOrderedMap();
String[] fieldNames = getHighlightFields(query, req, defaultFields);
Document[] readDocs = new Document[docs.size()];
{
// pre-fetch documents using the Searcher's doc cache
Set<String> fset = new HashSet<String>();
for(String f : fieldNames) { fset.add(f); }
// fetch unique key if one exists.
SchemaField keyField = schema.getUniqueKeyField();
if(null != keyField)
fset.add(keyField.getName());
searcher.readDocs(readDocs, docs, fset);
}
// Highlight each document
DocIterator iterator = docs.iterator();
for (int i = 0; i < docs.size(); i++) {
int docId = iterator.nextDoc();
Document doc = readDocs[i];
NamedList docSummaries = new SimpleOrderedMap();
for (String fieldName : fieldNames) {
fieldName = fieldName.trim();
String[] docTexts = doc.getValues(fieldName);
if (docTexts == null) continue;
// get highlighter, and number of fragments for this field
Highlighter highlighter = getHighlighter(query, fieldName, req);
int numFragments = getMaxSnippets(fieldName, params);
String[] summaries;
TextFragment[] frag;
if (docTexts.length == 1) {
// single-valued field
TokenStream tstream;
try {
// attempt term vectors
tstream = TokenSources.getTokenStream(searcher.getReader(), docId, fieldName);
}
catch (IllegalArgumentException e) {
// fall back to analyzer
tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[0])), 10);
}
frag = highlighter.getBestTextFragments(tstream, docTexts[0], false, numFragments);
}
else {
// multi-valued field
MultiValueTokenStream tstream;
tstream = new MultiValueTokenStream(fieldName, docTexts, schema.getAnalyzer(), true);
frag = highlighter.getBestTextFragments(tstream, tstream.asSingleValue(), false, numFragments);
}
// convert fragments back into text
// TODO: we can include score and position information in output as snippet attributes
if (frag.length > 0) {
ArrayList<String> fragTexts = new ArrayList<String>();
for (int j = 0; j < frag.length; j++) {
if ((frag[j] != null) && (frag[j].getScore() > 0)) {
fragTexts.add(frag[j].toString());
}
}
summaries = fragTexts.toArray(new String[0]);
if (summaries.length > 0)
docSummaries.add(fieldName, summaries);
}
}
String printId = schema.printableUniqueKey(doc);
fragments.add(printId == null ? null : printId, docSummaries);
}
return fragments;
}
}
/**
* Helper class which creates a single TokenStream out of values from a
* multi-valued field.
*/
class MultiValueTokenStream extends TokenStream {
private String fieldName;
private String[] values;
private Analyzer analyzer;
private int curIndex; // next index into the values array
private int curOffset; // offset into concatenated string
private TokenStream currentStream; // tokenStream currently being iterated
private boolean orderTokenOffsets;
/** Constructs a TokenStream for consecutively-analyzed field values
*
* @param fieldName name of the field
* @param values array of field data
* @param analyzer analyzer instance
*/
public MultiValueTokenStream(String fieldName, String[] values,
Analyzer analyzer, boolean orderTokenOffsets) {
this.fieldName = fieldName;
this.values = values;
this.analyzer = analyzer;
curIndex = -1;
curOffset = 0;
currentStream = null;
this.orderTokenOffsets=orderTokenOffsets;
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public Token next() throws IOException {
int extra = 0;
if(currentStream == null) {
curIndex++;
if(curIndex < values.length) {
currentStream = analyzer.tokenStream(fieldName,
new StringReader(values[curIndex]));
if (orderTokenOffsets) currentStream = new TokenOrderingFilter(currentStream,10);
// add extra space between multiple values
if(curIndex > 0)
extra = analyzer.getPositionIncrementGap(fieldName);
} else {
return null;
}
}
Token nextToken = currentStream.next();
if(nextToken == null) {
curOffset += values[curIndex].length();
currentStream = null;
return next();
}
// create an modified token which is the offset into the concatenated
// string of all values
Token offsetToken = new Token(nextToken.termText(),
nextToken.startOffset() + curOffset,
nextToken.endOffset() + curOffset);
offsetToken.setPositionIncrement(nextToken.getPositionIncrement() + extra*10);
return offsetToken;
}
/**
* Returns all values as a single String into which the Tokens index with
* their offsets.
*/
public String asSingleValue() {
StringBuilder sb = new StringBuilder();
for(String str : values)
sb.append(str);
return sb.toString();
}
}
/** Orders Tokens in a window first by their startOffset ascending.
* endOffset is currently ignored.
* This is meant to work around fickleness in the highlighter only. It
* can mess up token positions and should not be used for indexing or querying.
*/
class TokenOrderingFilter extends TokenFilter {
private final int windowSize;
private final LinkedList<Token> queue = new LinkedList<Token>();
private boolean done=false;
protected TokenOrderingFilter(TokenStream input, int windowSize) {
super(input);
this.windowSize = windowSize;
}
@Override
public Token next() throws IOException {
while (!done && queue.size() < windowSize) {
Token newTok = input.next();
if (newTok==null) {
done=true;
break;
}
// reverse iterating for better efficiency since we know the
// list is already sorted, and most token start offsets will be too.
ListIterator<Token> iter = queue.listIterator(queue.size());
while(iter.hasPrevious()) {
if (newTok.startOffset() >= iter.previous().startOffset()) {
// insertion will be before what next() would return (what
// we just compared against), so move back one so the insertion
// will be after.
iter.next();
break;
}
}
iter.add(newTok);
}
return queue.isEmpty() ? null : queue.removeFirst();
}
}

View File

@ -17,10 +17,12 @@
package org.apache.solr.highlight; package org.apache.solr.highlight;
import java.util.HashMap; import org.apache.solr.core.SolrCore;
import org.apache.solr.request.*;
import org.apache.solr.util.*;
import org.apache.solr.schema.*;
import org.apache.solr.util.AbstractSolrTestCase; import java.util.HashMap;
import org.apache.solr.util.TestHarness;
/** /**
* Tests some basic functionality of Solr while demonstrating good * Tests some basic functionality of Solr while demonstrating good
@ -45,6 +47,27 @@ public class HighlighterTest extends AbstractSolrTestCase {
super.tearDown(); super.tearDown();
} }
public void testConfig()
{
SolrHighlighter highlighter = SolrCore.getSolrCore().getHighlighter();
System.out.println( "highlighter" );
// Make sure we loaded the one formatter
SolrFormatter fmt1 = highlighter.formatters.get( null );
SolrFormatter fmt2 = highlighter.formatters.get( "" );
assertSame( fmt1, fmt2 );
assertTrue( fmt1 instanceof HtmlFormatter );
// Make sure we loaded the one formatter
SolrFragmenter gap = highlighter.fragmenters.get( "gap" );
SolrFragmenter regex = highlighter.fragmenters.get( "regex" );
SolrFragmenter frag = highlighter.fragmenters.get( null );
assertSame( gap, frag );
assertTrue( gap instanceof GapFragmenter );
assertTrue( regex instanceof RegexFragmenter );
}
public void testTermVecHighlight() { public void testTermVecHighlight() {
// do summarization using term vectors // do summarization using term vectors

View File

@ -265,6 +265,30 @@
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler" /> <requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />
<requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" /> <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />
<highlighting>
<!-- Configure the standard fragmenter -->
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
<lst name="defaults">
<int name="hl.fragsize">100</int>
</lst>
</fragmenter>
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
<lst name="defaults">
<int name="hl.fragsize">70</int>
</lst>
</fragmenter>
<!-- Configure the standard formatter -->
<formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
<lst name="defaults">
<str name="hl.simple.pre"><![CDATA[<em>]]></str>
<str name="hl.simple.post"><![CDATA[</em>]]></str>
</lst>
</formatter>
</highlighting>
<!-- enable streaming for testing... --> <!-- enable streaming for testing... -->
<requestDispatcher handleSelect="true" > <requestDispatcher handleSelect="true" >
<requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048" /> <requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048" />