highlighting: SOLR-24

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@421678 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2006-07-13 18:22:24 +00:00
parent bf363aafc7
commit 1c06c77393
5 changed files with 380 additions and 273 deletions

View File

@ -18,14 +18,15 @@ New Features
10. copyField accepts dynamicfield-like names as the source.
(Darren Erik Vengroff via yonik, SOLR-21)
11. new DocSet.andNot(), DocSet.andNotSize() (yonik)
12. Ability to store term vectors. (Note: standard request handler does
not currently do anything with term vectors) (Mike Klaas via yonik, SOLR-23)
12. Ability to store term vectors for fields. (Mike Klaas via yonik, SOLR-23)
13. New abstract BufferedTokenStream for people who want to write
Tokenizers or TokenFilters that require arbitrary buffering of the
stream. (SOLR-11 / yonik, hossman)
14. New RemoveDuplicatesToken - useful in situations where
synonyms, stemming, or word-delimiter-ing produce identical tokens at
the same position. (SOLR-11 / yonik, hossman)
15. Added highlighting to SolrPluginUtils and implemented in StandardRequestHandler
and DisMaxRequestHandler (SOLR-24 / Mike Klaas via hossman,yonik)
Changes in runtime behavior
1. classes reorganized into different packages, package names changed to Apache

View File

@ -41,6 +41,7 @@ import org.apache.solr.schema.FieldType;
import org.apache.solr.util.StrUtils;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.DisMaxParams;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -161,7 +162,7 @@ public class DisMaxRequestHandler
/* :NOOP */
}
protected final U.CommonParams params = new U.CommonParams();
protected final DisMaxParams params = new DisMaxParams();
public DisMaxRequestHandler() {
super();
@ -219,6 +220,7 @@ public class DisMaxRequestHandler
try {
int flags = 0;
SolrIndexSearcher s = req.getSearcher();
IndexSchema schema = req.getSchema();
@ -267,7 +269,7 @@ public class DisMaxRequestHandler
if (dis instanceof BooleanQuery) {
BooleanQuery t = new BooleanQuery();
U.flatenBooleanQuery(t, (BooleanQuery)dis);
U.flattenBooleanQuery(t, (BooleanQuery)dis);
U.setMinShouldMatch(t, minShouldMatch);
@ -332,19 +334,19 @@ public class DisMaxRequestHandler
/* * * Generate Main Results * * */
flags |= U.setReturnFields(U.getParam(req, params.FL, params.fl), rsp);
DocList results = s.getDocList(query, restrictions,
SolrPluginUtils.getSort(req),
req.getStart(), req.getLimit(),
SolrIndexSearcher.GET_SCORES);
flags);
rsp.add("search-results",results);
U.setReturnFields(U.getParam(req, params.FL, params.fl), rsp);
/* * * Debugging Info * * */
try {
NamedList debug = U.doStandardDebug(req, userQuery, query, results);
NamedList debug = U.doStandardDebug(req, userQuery, query, results, params);
if (null != debug) {
debug.add("boostquery", boostQuery);
debug.add("boostfunc", boostFunc);
@ -364,6 +366,18 @@ public class DisMaxRequestHandler
rsp.add("exception_during_debug", SolrException.toStr(e));
}
/* * * Highlighting/Summarizing * * */
if(U.getBooleanParam(req, params.HIGHLIGHT, params.highlight)) {
BooleanQuery highlightQuery = new BooleanQuery();
U.flattenBooleanQuery(highlightQuery, query);
NamedList sumData = U.doStandardHighlighting(results, highlightQuery,
req, params,
queryFields.keySet().toArray(new String[0]));
if(sumData != null)
rsp.add("highlighting", sumData);
}
} catch (Exception e) {
SolrException.log(SolrCore.log,e);
rsp.setException(e);

View File

@ -29,6 +29,8 @@ import java.net.URL;
import org.apache.solr.util.StrUtils;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.CommonParams;
import org.apache.solr.search.*;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.core.SolrCore;
@ -47,14 +49,18 @@ public class StandardRequestHandler implements SolrRequestHandler, SolrInfoMBean
long numRequests;
long numErrors;
/** shorten the class references for utilities */
private static class U extends SolrPluginUtils {
/* :NOOP */
}
/** parameters garnered from config file */
protected final CommonParams params = new CommonParams();
public void init(NamedList args) {
SolrCore.log.log(Level.INFO, "Unused request handler arguments:" + args);
params.setValues(args);
}
private final Pattern splitList=Pattern.compile(",| ");
public void handleRequest(SolrQueryRequest req, SolrQueryResponse rsp) {
numRequests++;
@ -63,24 +69,14 @@ public class StandardRequestHandler implements SolrRequestHandler, SolrInfoMBean
// we need to un-escape them before we pass to QueryParser
try {
String sreq = req.getQueryString();
String debug = req.getParam("debugQuery");
String defaultField = req.getParam("df");
String debug = U.getParam(req, params.DEBUG_QUERY, params.debugQuery);
String defaultField = U.getParam(req, params.DF, params.df);
// find fieldnames to return (fieldlist)
String fl = req.getParam("fl");
int flags=0;
String fl = U.getParam(req, params.FL, params.fl);
int flags = 0;
if (fl != null) {
// TODO - this could become more efficient if widely used.
// TODO - should field order be maintained?
String[] flst = splitList.split(fl,0);
if (flst.length > 0 && !(flst.length==1 && flst[0].length()==0)) {
Set<String> set = new HashSet<String>();
for (String fname : flst) {
if ("score".equals(fname)) flags |= SolrIndexSearcher.GET_SCORES;
set.add(fname);
}
rsp.setReturnFields(set);
}
flags |= U.setReturnFields(fl, rsp);
}
if (sreq==null) throw new SolrException(400,"Missing queryString");
@ -104,25 +100,20 @@ public class StandardRequestHandler implements SolrRequestHandler, SolrInfoMBean
DocList results = req.getSearcher().getDocList(query, null, sort, req.getStart(), req.getLimit(), flags);
rsp.add(null,results);
if (debug!=null) {
NamedList dbg = new NamedList();
try {
dbg.add("querystring",qs);
dbg.add("parsedquery",QueryParsing.toString(query,req.getSchema()));
dbg.add("explain", getExplainList(query, results, req.getSearcher(), req.getSchema()));
String otherQueryS = req.getParam("explainOther");
if (otherQueryS != null && otherQueryS.length() > 0) {
DocList otherResults = doQuery(otherQueryS,req.getSearcher(), req.getSchema(),0,10);
dbg.add("otherQuery",otherQueryS);
dbg.add("explainOther", getExplainList(query, otherResults, req.getSearcher(), req.getSchema()));
}
NamedList dbg = U.doStandardDebug(req, qs, query, results, params);
if (null != dbg)
rsp.add("debug", dbg);
} catch (Exception e) {
SolrException.logOnce(SolrCore.log,"Exception during debug:",e);
dbg.add("exception_during_debug", SolrException.toStr(e));
}
rsp.add("debug",dbg);
SolrException.logOnce(SolrCore.log, "Exception durring debug", e);
rsp.add("exception_during_debug", SolrException.toStr(e));
}
NamedList sumData = SolrPluginUtils.doStandardHighlighting(
results, query, req, params, new String[]{defaultField});
if(sumData != null)
rsp.add("highlighting", sumData);
} catch (SolrException e) {
rsp.setException(e);
numErrors++;
@ -135,52 +126,6 @@ public class StandardRequestHandler implements SolrRequestHandler, SolrInfoMBean
}
}
/**
 * Builds one score explanation entry per document in a result list.
 *
 * Each entry is keyed by "id=&lt;printable unique key&gt;,internal_docid=&lt;n&gt;"
 * (the "id=" part is omitted when the schema yields no printable key) and
 * holds the explanation text prefixed with a newline.
 *
 * @param query    query whose scoring is explained
 * @param results  documents to explain
 * @param searcher searcher used both to explain and to load each document
 * @param schema   schema used to render the unique key
 * @return the explanations, in result order
 * @throws IOException if the searcher fails
 */
private NamedList getExplainList(Query query, DocList results, SolrIndexSearcher searcher, IndexSchema schema) throws IOException {
  final NamedList explanations = new NamedList();
  final DocIterator docs = results.iterator();
  for (int n = 0; n < results.size(); n++) {
    final int internalId = docs.nextDoc();
    final Explanation why = searcher.explain(query, internalId);

    final Document stored = searcher.doc(internalId);
    final String key = schema.printableUniqueKey(stored);

    final String prefix = (key != null) ? "id=" + key + "," : "";
    final String label = prefix + "internal_docid=" + internalId;
    explanations.add(label, "\n" + why.toString());
  }
  return explanations;
}
/**
 * Runs a query given as "query[;sort spec]" and returns the matching page.
 *
 * @param sreq     request string; the text before the first ';' is the
 *                 query, the second ';'-separated command (if any) is
 *                 parsed as a sort specification
 * @param searcher searcher to execute against
 * @param schema   schema used for query and sort parsing
 * @param start    offset of the first document to return
 * @param limit    maximum number of documents; replaced by the sort
 *                 spec's count when that count is non-negative
 * @return the resulting document list
 * @throws IOException if the searcher fails
 */
private DocList doQuery(String sreq, SolrIndexSearcher searcher, IndexSchema schema, int start, int limit) throws IOException {
  final List<String> parts = StrUtils.splitSmart(sreq, ';');

  final String queryText = parts.isEmpty() ? "" : parts.get(0);
  final Query parsed = QueryParsing.parseQuery(queryText, schema);

  // When the command after the query is a simple sort on an indexed
  // field, Lucene's own sorting can be used.
  Sort ordering = null;
  int rows = limit;
  if (parts.size() > 1) {
    final QueryParsing.SortSpec spec = QueryParsing.parseSort(parts.get(1), schema);
    if (spec != null) {
      ordering = spec.getSort();
      if (spec.getCount() >= 0) {
        rows = spec.getCount();
      }
    }
  }

  return searcher.getDocList(parsed, (DocSet) null, ordering, start, rows);
}
//////////////////////// SolrInfoMBeans methods //////////////////////

View File

@ -16,6 +16,7 @@
package org.apache.solr.util;
import org.apache.solr.core.Config; // highlighting
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrInfoMBean;
import org.apache.solr.core.SolrException;
@ -40,6 +41,7 @@ import org.apache.solr.schema.FieldType;
import org.apache.solr.util.StrUtils;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.XML;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -53,9 +55,22 @@ import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.highlight.Highlighter; // highlighting
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.xmlpull.v1.XmlPullParserException;
@ -73,6 +88,8 @@ import java.util.Map;
import java.util.HashMap;
import java.util.regex.Pattern;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter; // highlighting
import java.net.URL;
/**
@ -84,6 +101,9 @@ import java.net.URL;
* </p>
*
* <p>:TODO: refactor StandardRequestHandler to use these utilities</p>
*
* <p>:TODO: Many "standard" functionality methods are not cognisant of
* default parameter settings.</p>
*/
public class SolrPluginUtils {
@ -108,6 +128,8 @@ public class SolrPluginUtils {
String param, String def) {
String v = req.getParam(param);
// Note: parameters passed but given only a white-space value are
// considered equivalent to passing nothing for that parameter.
if (null == v || "".equals(v.trim())) {
return def;
}
@ -134,7 +156,18 @@ public class SolrPluginUtils {
return r;
}
/**
 * Reads a request parameter as a boolean flag.
 *
 * A missing or whitespace-only value yields <code>def</code>. Otherwise
 * the trimmed value is false only when it is exactly the lower-case
 * string "false"; every other value is true.
 *
 * @param req   request to read from
 * @param param name of the parameter
 * @param def   value returned when the parameter is absent or blank
 * @return the interpreted flag
 */
public static boolean getBooleanParam(SolrQueryRequest req,
                                      String param, boolean def) {
  final String raw = req.getParam(param);
  final String trimmed = (raw == null) ? "" : raw.trim();
  if (trimmed.length() == 0) {
    return def;
  }
  return !"false".equals(trimmed);
}
private final static Pattern splitList=Pattern.compile(",| ");
@ -142,29 +175,36 @@ public class SolrPluginUtils {
* Assumes the standard query param of "fl" to specify the return fields
* @see #setReturnFields(String,SolrQueryResponse)
*/
public static void setReturnFields(SolrQueryRequest req,
public static int setReturnFields(SolrQueryRequest req,
SolrQueryResponse res) {
setReturnFields(req.getParam(FL), res);
return setReturnFields(req.getParam(FL), res);
}
/**
* Given a comma or space separated list of field names, sets the field list on the
* SolrQueryResponse.
*
* @return bitfield of SolrIndexSearcher flags that need to be set
*/
public static void setReturnFields(String fl,
public static int setReturnFields(String fl,
SolrQueryResponse res) {
int flags = 0;
if (fl != null) {
// TODO - this could become more efficient if widely used.
// TODO - should field order be maintained?
String[] flst = splitList.split(fl.trim(),0);
if (flst.length > 0 && !(flst.length==1 && flst[0].length()==0)) {
Set<String> set = new HashSet<String>();
for (String fname : flst) set.add(fname);
for (String fname : flst) {
if("score".equalsIgnoreCase(fname))
flags |= SolrIndexSearcher.GET_SCORES;
set.add(fname);
}
res.setReturnFields(set);
}
}
return flags;
}
/**
@ -201,24 +241,24 @@ public class SolrPluginUtils {
* @param query the query built from the userQuery
* (and perhaps other clauses) that identifies the main
* result set of the response.
* @param results the main result set of hte response
* @param results the main result set of the response
*/
public static NamedList doStandardDebug(SolrQueryRequest req,
String userQuery,
Query query,
DocList results)
DocList results,
CommonParams params)
throws IOException {
String debug = req.getParam("debugQuery");
String debug = getParam(req, params.DEBUG_QUERY, params.debugQuery);
NamedList dbg = null;
if (debug!=null) {
dbg = new NamedList();
/* userQuery may have been pre-processed .. expose that */
dbg.add("rawquerystring",req.getQueryString());
dbg.add("querystring",userQuery);
dbg.add("rawquerystring", req.getQueryString());
dbg.add("querystring", userQuery);
/* QueryParsing.toString isn't perfect, use it to see converted
* values, use regular toString to see any attributes of the
@ -274,6 +314,177 @@ public class SolrPluginUtils {
return explainList;
}
/**
 * Creates the stock Highlighter for a query: matches are wrapped in
 * {@code <em>}...{@code </em>}, scored by a QueryScorer built from the
 * query, and fragmented by a GapFragmenter.
 *
 * @param query query whose terms should be highlighted
 * @return a newly constructed Highlighter
 */
public static Highlighter getDefaultHighlighter(Query query) {
  final SimpleHTMLFormatter emphasis = new SimpleHTMLFormatter("<em>", "</em>");
  final Highlighter result = new Highlighter(emphasis, new QueryScorer(query));
  result.setTextFragmenter(new GapFragmenter());
  return result;
}
/**
 * Convenience overload: builds the default Highlighter for
 * <code>query</code> and delegates to the Highlighter-based
 * <code>getHighlights</code>.
 *
 * @param docs         query results
 * @param fieldNames   list of fields to summarize
 * @param query        resulting query object
 * @param searcher     the SolrIndexSearcher corresponding to a request
 * @param numFragments maximum number of summary fragments to return for
 *                     a given field
 * @return whatever the Highlighter-based overload returns for these inputs
 * @throws IOException propagated from the delegate
 */
public static NamedList getHighlights(DocList docs,
                                      String[] fieldNames,
                                      Query query,
                                      SolrIndexSearcher searcher,
                                      int numFragments
                                      ) throws IOException {
  final Highlighter defaultHighlighter = getDefaultHighlighter(query);
  return getHighlights(docs, fieldNames, searcher, defaultHighlighter, numFragments);
}
/**
 * Generates a list of highlighted query fragments for each item in a list
 * of documents.
 *
 * @param docs         query results
 * @param fieldNames   list of fields to summarize; each name is trimmed
 *                     before use
 * @param searcher     the SolrIndexSearcher corresponding to a request
 * @param highlighter  a customized Highlighter instance
 * @param numFragments maximum number of summary fragments to return for
 *                     a given field
 *
 * @return NamedList with one entry per document, keyed by the document's
 *         printable unique key (null when the schema yields none). Each
 *         entry is a NamedList mapping a field name to a String[] of the
 *         fragments that scored above zero; fields with no stored value
 *         or no scoring fragment are omitted.
 * @throws IOException if reading the document or its tokens fails
 */
public static NamedList getHighlights(DocList docs,
                                      String[] fieldNames,
                                      SolrIndexSearcher searcher,
                                      Highlighter highlighter,
                                      int numFragments
                                      ) throws IOException {
  NamedList fragments = new NamedList();
  DocIterator iterator = docs.iterator();
  for (int i = 0; i < docs.size(); i++) {
    int docId = iterator.nextDoc();
    // use the Searcher's doc cache
    Document doc = searcher.doc(docId);
    NamedList docSummaries = new NamedList();
    for (String fieldName : fieldNames) {
      fieldName = fieldName.trim();
      String[] docTexts = doc.getValues(fieldName);
      if (docTexts == null) {
        continue;
      }

      TextFragment[] frag;
      if (docTexts.length == 1) {
        // single-valued field
        TokenStream tstream;
        try {
          // attempt term vectors
          tstream = TokenSources.getTokenStream(
            searcher.getReader(), docId, fieldName);
        } catch (IllegalArgumentException e) {
          // no usable term vector for this field: fall back to
          // re-analyzing the stored text
          tstream = searcher.getSchema().getAnalyzer().tokenStream(
            fieldName, new StringReader(docTexts[0]));
        }
        frag = highlighter.getBestTextFragments(
          tstream, docTexts[0], false, numFragments);
      } else {
        // multi-valued field: analyze the values as one concatenated text
        MultiValueTokenStream tstream = new MultiValueTokenStream(
          fieldName, docTexts, searcher.getSchema().getAnalyzer());
        frag = highlighter.getBestTextFragments(
          tstream, tstream.asSingleValue(), false, numFragments);
      }

      // convert fragments back into text, keeping only those that scored
      // TODO: we can include score and position information in output as
      // snippet attributes
      ArrayList<String> fragTexts = new ArrayList<String>();
      for (TextFragment candidate : frag) {
        if (candidate != null && candidate.getScore() > 0) {
          fragTexts.add(candidate.toString());
        }
      }
      if (!fragTexts.isEmpty()) {
        docSummaries.add(fieldName,
                         fragTexts.toArray(new String[fragTexts.size()]));
      }
    }
    String printId = searcher.getSchema().printableUniqueKey(doc);
    fragments.add(printId, docSummaries);
  }
  return fragments;
}
/**
 * Perform highlighting of selected fields.
 *
 * The fields to highlight come from the highlight-fields parameter when
 * it is non-blank; otherwise from <code>defaultFields</code> when that is
 * non-empty with a non-null first element; otherwise from the schema's
 * default search field. A formatter class named by the request/config
 * replaces the default formatter when given.
 *
 * @param docs          query results
 * @param query         the (possibly re-written) query
 * @param req           associated SolrQueryRequest
 * @param params        supplies the parameter names and the configured
 *                      default values read by this method
 * @param defaultFields default search field list
 *
 * @return NamedList containing summary data, or null if highlighting is
 *         disabled.
 * @throws IOException propagated from fragment generation
 */
public static NamedList doStandardHighlighting(DocList docs,
                                               Query query,
                                               SolrQueryRequest req,
                                               CommonParams params,
                                               String[] defaultFields
                                               ) throws IOException {
  final boolean enabled = getBooleanParam(req, params.HIGHLIGHT, params.highlight);
  if (!enabled) {
    return null;
  }

  // Resolve which fields to highlight.
  final String requested = getParam(req, params.HIGHLIGHT_FIELDS,
                                    params.highlightFields);
  final String[] fields;
  if (requested != null && !requested.trim().equals("")) {
    fields = splitList.split(requested.trim());
  } else if (defaultFields != null && defaultFields.length != 0
             && defaultFields[0] != null) {
    fields = defaultFields;
  } else {
    // nothing requested and no usable default: use the schema's
    // default search field
    fields = new String[]{req.getSchema().getDefaultSearchFieldName()};
  }

  // Resolve the highlighter, honouring a custom formatter class if named.
  final String formatterSpec = getParam(req, params.HIGHLIGHT_FORMATTER_CLASS,
                                        params.highlightFormatterClass);
  final Highlighter highlighter;
  if (formatterSpec != null && !formatterSpec.equals("")) {
    highlighter = new Highlighter(
      (Formatter)Config.newInstance(formatterSpec),
      new QueryScorer(query));
    highlighter.setTextFragmenter(new GapFragmenter());
  } else {
    highlighter = getDefaultHighlighter(query);
  }

  final int numFragments = getNumberParam(req, params.MAX_SNIPPETS,
                                          params.maxSnippets).intValue();

  return getHighlights(docs, fields, req.getSearcher(), highlighter, numFragments);
}
/**
* Executes a basic query in lucene syntax
*/
@ -455,7 +666,7 @@ public class SolrPluginUtils {
* so do not attempt to reuse it.
* </p>
*/
public static void flatenBooleanQuery(BooleanQuery to, BooleanQuery from) {
public static void flattenBooleanQuery(BooleanQuery to, BooleanQuery from) {
BooleanClause[] c = from.getClauses();
for (int i = 0; i < c.length; i++) {
@ -468,7 +679,7 @@ public class SolrPluginUtils {
&& !c[i].isProhibited()) {
/* we can recurse */
flatenBooleanQuery(to, (BooleanQuery)ci);
flattenBooleanQuery(to, (BooleanQuery)ci);
} else {
to.add(c[i]);
@ -512,169 +723,6 @@ public class SolrPluginUtils {
return s.toString().replace("\"","");
}
/**
 * A collection of common params, both for Plugin initialization and
 * for Requests. The upper-case static fields are parameter names; the
 * lower-case instance fields hold the corresponding default values.
 */
public static class CommonParams {

  /** query and init param for tiebreaker value */
  public static String TIE = "tie";
  /** query and init param for query fields */
  public static String QF = "qf";
  /** query and init param for phrase boost fields */
  public static String PF = "pf";
  /** query and init param for MinShouldMatch specification */
  public static String MM = "mm";
  /** query and init param for Phrase Slop value */
  public static String PS = "ps";
  /** query and init param for boosting query */
  public static String BQ = "bq";
  /** query and init param for boosting functions */
  public static String BF = "bf";
  /** query and init param for filtering query */
  public static String FQ = "fq";
  /** query and init param for field list */
  public static String FL = "fl";
  /**
   * query and init param "gen".
   * NOTE(review): previously documented as "field list", which looks
   * copied from FL; it is not read by setValues -- confirm its purpose.
   */
  public static String GEN = "gen";

  /** the default tie breaker to use in DisjunctionMaxQueries */
  public float tiebreaker = 0.0f;
  /** the default query fields to be used */
  public String qf = null;
  /** the default phrase boosting fields to be used */
  public String pf = null;
  /** the default min should match to be used */
  public String mm = "100%";
  /** the default phrase slop to be used */
  public int pslop = 0;
  /** the default boosting query to be used */
  public String bq = null;
  /** the default boosting functions to be used */
  public String bf = null;
  /** the default filtering query to be used */
  public String fq = null;
  /** the default field list to be used */
  public String fl = null;

  public CommonParams() {
    /* :NOOP: */
  }

  /** @see #setValues */
  public CommonParams(NamedList args) {
    this();
    setValues(args);
  }

  /**
   * Sets the params using values from a NamedList, useful in the
   * init method for your handler.
   *
   * <p>
   * If any param is not of the expected type, a severe error is
   * logged, and the param is skipped.
   * </p>
   *
   * <p>
   * If any param is not in the NamedList, it is skipped and the
   * old value is left alone.
   * </p>
   *
   * @param args init arguments to read from
   */
  public void setValues(NamedList args) {

    final Object tie = args.get(TIE);
    if (tie != null) {
      if (tie instanceof Float) {
        tiebreaker = ((Float)tie).floatValue();
      } else {
        SolrCore.log.severe("init param is not a float: " + TIE);
      }
    }

    qf = stringOrCurrent(args, QF, qf);
    pf = stringOrCurrent(args, PF, pf);
    mm = stringOrCurrent(args, MM, mm);

    final Object slop = args.get(PS);
    if (slop != null) {
      if (slop instanceof Integer) {
        pslop = ((Integer)slop).intValue();
      } else {
        SolrCore.log.severe("init param is not an int: " + PS);
      }
    }

    bq = stringOrCurrent(args, BQ, bq);
    bf = stringOrCurrent(args, BF, bf);
    fq = stringOrCurrent(args, FQ, fq);
    fl = stringOrCurrent(args, FL, fl);
  }

  /**
   * Looks up a String-typed init param; returns <code>current</code> when
   * the param is absent, and logs a severe error and returns
   * <code>current</code> when it is present but not a String.
   */
  private static String stringOrCurrent(NamedList args, String name,
                                        String current) {
    final Object value = args.get(name);
    if (value == null) {
      return current;
    }
    if (value instanceof String) {
      return value.toString();
    }
    SolrCore.log.severe("init param is not a str: " + name);
    return current;
  }
}
/**
* A subclass of SolrQueryParser that supports aliasing fields for
* constructing DisjunctionMaxQueries.
@ -763,8 +811,6 @@ public class SolrPluginUtils {
}
/**
* Determines the correct Sort based on the request parameter "sort"
*
@ -818,6 +864,105 @@ public class SolrPluginUtils {
}
}
}
/**
 * Helper class which creates a single TokenStream out of values from a
 * multi-valued field. Each value is analyzed in turn and its token
 * offsets are shifted so they index into the plain concatenation of all
 * values (see {@link #asSingleValue()}).
 */
class MultiValueTokenStream extends TokenStream {
  private final String fieldName;
  private final String[] values;
  private final Analyzer analyzer;
  private int curIndex;                  // index of the value currently being analyzed
  private int curOffset;                 // offset into concatenated string
  private TokenStream currentStream;     // tokenStream currently being iterated

  /** Constructs a TokenStream for consecutively-analyzed field values
   *
   * @param fieldName name of the field
   * @param values array of field data
   * @param analyzer analyzer instance
   */
  public MultiValueTokenStream(String fieldName, String[] values,
                               Analyzer analyzer) {
    this.fieldName = fieldName;
    this.values = values;
    this.analyzer = analyzer;
    curIndex = -1;
    curOffset = 0;
    currentStream = null;
  }

  /**
   * Returns the next token in the stream, or null at EOS.
   *
   * Values that produce no tokens are skipped iteratively, so stack depth
   * does not grow with the number of consecutive empty values.
   *
   * @return a new Token carrying the term text with offsets shifted into
   *         the concatenated text, or null when every value is exhausted.
   *         Only text, offsets and position increment are carried over.
   */
  public Token next() throws IOException {
    int extra = 0;
    while (true) {
      if (currentStream == null) {
        curIndex++;
        if (curIndex >= values.length) {
          return null;
        }
        currentStream = analyzer.tokenStream(fieldName,
                                             new StringReader(values[curIndex]));
        // add extra space between multiple values
        extra = (curIndex > 0) ? analyzer.getPositionIncrementGap(fieldName) : 0;
      }

      Token nextToken = currentStream.next();
      if (nextToken != null) {
        // create a modified token whose offsets point into the
        // concatenated string of all values
        Token offsetToken = new Token(nextToken.termText(),
                                      nextToken.startOffset() + curOffset,
                                      nextToken.endOffset() + curOffset);
        // NOTE(review): the gap is scaled by 10, presumably so the first
        // token of a new value stands out to a gap-sensitive fragmenter
        // -- confirm the intended factor.
        offsetToken.setPositionIncrement(nextToken.getPositionIncrement() + extra*10);
        return offsetToken;
      }

      // current value exhausted: account for its length and move on
      curOffset += values[curIndex].length();
      currentStream = null;
    }
  }

  /**
   * Returns all values as a single String into which the Tokens index with
   * their offsets. The values are joined with no separator.
   */
  public String asSingleValue() {
    StringBuilder sb = new StringBuilder();
    for (String str : values) {
      sb.append(str);
    }
    return sb.toString();
  }
}
/**
 * A SimpleFragmenter variant that also begins a new fragment whenever a
 * token's position increment exceeds {@link #INCREMENT_THRESHOLD}
 * (this behaves much better in the presence of multi-valued fields).
 */
class GapFragmenter extends SimpleFragmenter {
  /** Position increments above this value force a fragment break. */
  public static final int INCREMENT_THRESHOLD = 50;

  /** End offset of the token that opened the current fragment (0 initially). */
  protected int fragOffsetAccum = 0;

  /**
   * Resets fragment tracking for a new text. The superclass
   * implementation is not invoked.
   */
  public void start(String originalText) {
    fragOffsetAccum = 0;
  }

  /**
   * Decides whether <code>token</code> starts a new fragment: true when
   * its end offset reaches the current fragment start plus the fragment
   * size, or when its position increment exceeds the threshold.
   */
  public boolean isNewFragment(Token token) {
    final int end = token.endOffset();
    if (end < fragOffsetAccum + getFragmentSize()
        && token.getPositionIncrement() <= INCREMENT_THRESHOLD) {
      return false;
    }
    // the next fragment is measured from this token's end
    fragOffsetAccum = end;
    return true;
  }
}

View File

@ -339,6 +339,8 @@
<dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
<dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
<dynamicField name="t_*" type="text" indexed="true" stored="true"/>
<dynamicField name="tv_*" type="text" indexed="true" stored="true"
termVectors="true" termPositions="true" termOffsets="true"/>
<!-- for testing to ensure that longer patterns are matched first -->