diff --git a/src/java/org/apache/solr/request/SolrParams.java b/src/java/org/apache/solr/request/SolrParams.java index a6c5a3439f8..dd2d5e1718f 100644 --- a/src/java/org/apache/solr/request/SolrParams.java +++ b/src/java/org/apache/solr/request/SolrParams.java @@ -60,14 +60,6 @@ public abstract class SolrParams { public static final String DEBUG_QUERY = "debugQuery"; /** another query to explain against */ public static final String EXPLAIN_OTHER = "explainOther"; - /** wether to highlight */ - public static final String HIGHLIGHT = "highlight"; - /** fields to highlight */ - public static final String HIGHLIGHT_FIELDS = "highlightFields"; - /** maximum highlight fragments to return */ - public static final String MAX_SNIPPETS = "maxSnippets"; - /** override default highlight Formatter class */ - public static final String HIGHLIGHT_FORMATTER_CLASS = "highlightFormatterClass"; /** * Should facet counts be calculated? diff --git a/src/java/org/apache/solr/util/HighlightingUtils.java b/src/java/org/apache/solr/util/HighlightingUtils.java index 3b41f8c33ca..3de90bbcade 100644 --- a/src/java/org/apache/solr/util/HighlightingUtils.java +++ b/src/java/org/apache/solr/util/HighlightingUtils.java @@ -18,11 +18,14 @@ package org.apache.solr.util; import java.io.IOException; import java.io.StringReader; -import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.List; +import java.util.LinkedList; +import java.util.ArrayList; +import java.util.ListIterator; import org.apache.solr.request.*; import org.apache.solr.search.DocIterator; @@ -30,7 +33,7 @@ import org.apache.solr.search.DocList; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.schema.SchemaField; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.*; import org.apache.lucene.document.Document; import org.apache.lucene.search.Query; import org.apache.lucene.search.highlight.*; @@ -38,8 +41,7 @@ import org.apache.lucene.search.highlight.*; /** * Collection of Utility and Factory methods for Highlighting. */ -public class HighlightingUtils -{ +public class HighlightingUtils { private static final String SIMPLE = "simple"; private static final String HIGHLIGHT = "hl"; @@ -53,8 +55,7 @@ public class HighlightingUtils private static final String FIELD_MATCH = PREFIX+"requireFieldMatch"; private static SolrParams DEFAULTS = null; - static - { + static { Map map = new HashMap(); map.put(SNIPPETS, "1"); map.put(FRAGSIZE, "100"); @@ -66,8 +67,7 @@ public class HighlightingUtils } /** Combine request parameters with highlighting defaults. */ - private static SolrParams getParams(SolrQueryRequest request) - { + private static SolrParams getParams(SolrQueryRequest request) { return new DefaultSolrParams(request.getParams(), DEFAULTS); } @@ -76,8 +76,7 @@ public class HighlightingUtils * @param request The current SolrQueryRequest * @return true if highlighting enabled, false if not. */ - public static boolean isHighlightingEnabled(SolrQueryRequest request) - { + public static boolean isHighlightingEnabled(SolrQueryRequest request) { return getParams(request).getBool(HIGHLIGHT, false); } @@ -87,8 +86,7 @@ public class HighlightingUtils * @param fieldName The name of the field * @param request The current SolrQueryRequest */ - public static Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) - { + public static Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) { Highlighter highlighter = new Highlighter( getFormatter(fieldName, request), getQueryScorer(query, fieldName, request)); @@ -102,15 +100,12 @@ public class HighlightingUtils * @param fieldName The name of the field * @param request The SolrQueryRequest */ - public static QueryScorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) - { + public static QueryScorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) { boolean reqFieldMatch = getParams(request).getFieldBool(fieldName, FIELD_MATCH, false); - if (reqFieldMatch) - { + if (reqFieldMatch) { return new QueryScorer(query, request.getSearcher().getReader(), fieldName); } - else - { + else { return new QueryScorer(query); } } @@ -123,25 +118,20 @@ public class HighlightingUtils * @param request The current SolrQueryRequest * @param defaultFields Programmatic default highlight fields, used if nothing is specified in the handler config or the request. */ - public static String[] getHighlightFields(Query query, SolrQueryRequest request, String[] defaultFields) - { + public static String[] getHighlightFields(Query query, SolrQueryRequest request, String[] defaultFields) { String fields[] = getParams(request).getParams(FIELDS); // if no fields specified in the request, or the handler, fall back to programmatic default, or default search field. - if(emptyArray(fields)) - { + if(emptyArray(fields)) { // use default search field if highlight fieldlist not specified. - if (emptyArray(defaultFields)) - { + if (emptyArray(defaultFields)) { fields = new String[]{request.getSchema().getDefaultSearchFieldName()}; } - else - { + else { fields = defaultFields; } } - else if (fields.length == 1) - { + else if (fields.length == 1) { // if there's a single request/handler value, it may be a space/comma separated list fields = SolrPluginUtils.split(fields[0]); } @@ -149,8 +139,7 @@ public class HighlightingUtils return fields; } - private static boolean emptyArray(String[] arr) - { + private static boolean emptyArray(String[] arr) { return (arr == null || arr.length == 0 || arr[0] == null || arr[0].trim().length() == 0); } @@ -161,8 +150,7 @@ public class HighlightingUtils * @param fieldName The name of the field * @param request The current SolrQueryRequest */ - public static int getMaxSnippets(String fieldName, SolrQueryRequest request) - { + public static int getMaxSnippets(String fieldName, SolrQueryRequest request) { return Integer.parseInt(getParams(request).getFieldParam(fieldName, SNIPPETS)); } @@ -175,8 +163,7 @@ public class HighlightingUtils * @param request The current SolrQueryRequest * @return An appropriate Formatter. */ - public static Formatter getFormatter(String fieldName, SolrQueryRequest request) - { + public static Formatter getFormatter(String fieldName, SolrQueryRequest request) { SolrParams p = getParams(request); // SimpleHTMLFormatter is the only supported Formatter at the moment @@ -192,8 +179,7 @@ public class HighlightingUtils * @param request The current SolrQueryRequest * @return An appropriate Fragmenter. */ - public static Fragmenter getFragmenter(String fieldName, SolrQueryRequest request) - { + public static Fragmenter getFragmenter(String fieldName, SolrQueryRequest request) { int fragsize = Integer.parseInt(getParams(request).getFieldParam(fieldName, FRAGSIZE)); return (fragsize <= 0) ? new NullFragmenter() : new GapFragmenter(fragsize); } @@ -210,8 +196,7 @@ public class HighlightingUtils * @return NamedList containing a NamedList for each document, which in * turns contains sets (field, summary) pairs. */ - public static NamedList doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException - { + public static NamedList doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException { if (!isHighlightingEnabled(req)) return null; @@ -232,13 +217,11 @@ public class HighlightingUtils // Highlight each document DocIterator iterator = docs.iterator(); - for (int i = 0; i < docs.size(); i++) - { + for (int i = 0; i < docs.size(); i++) { int docId = iterator.nextDoc(); Document doc = readDocs[i]; NamedList docSummaries = new SimpleOrderedMap(); - for (String fieldName : fieldNames) - { + for (String fieldName : fieldNames) { fieldName = fieldName.trim(); String[] docTexts = doc.getValues(fieldName); if (docTexts == null) continue; @@ -249,24 +232,20 @@ public class HighlightingUtils String[] summaries; TextFragment[] frag; - if (docTexts.length == 1) - { + if (docTexts.length == 1) { // single-valued field TokenStream tstream; - try - { + try { // attempt term vectors tstream = TokenSources.getTokenStream(searcher.getReader(), docId, fieldName); } - catch (IllegalArgumentException e) - { + catch (IllegalArgumentException e) { // fall back to analyzer tstream = new TokenOrderingFilter(searcher.getSchema().getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[0])), 10); } frag = highlighter.getBestTextFragments(tstream, docTexts[0], false, numFragments); } - else - { + else { // multi-valued field MultiValueTokenStream tstream; tstream = new MultiValueTokenStream(fieldName, docTexts, searcher.getSchema().getAnalyzer(), true); @@ -274,18 +253,16 @@ public class HighlightingUtils } // convert fragments back into text // TODO: we can include score and position information in output as snippet attributes - if (frag.length > 0) - { + if (frag.length > 0) { ArrayList fragTexts = new ArrayList(); - for (int j = 0; j < frag.length; j++) - { - if ((frag[j] != null) && (frag[j].getScore() > 0)) - { + for (int j = 0; j < frag.length; j++) { + if ((frag[j] != null) && (frag[j].getScore() > 0)) { fragTexts.add(frag[j].toString()); } } summaries = fragTexts.toArray(new String[0]); - if (summaries.length > 0) docSummaries.add(fieldName, summaries); + if (summaries.length > 0) + docSummaries.add(fieldName, summaries); } } String printId = searcher.getSchema().printableUniqueKey(doc); @@ -294,3 +271,160 @@ public class HighlightingUtils return fragments; } } + +/** + * Helper class which creates a single TokenStream out of values from a + * multi-valued field. + */ +class MultiValueTokenStream extends TokenStream { + private String fieldName; + private String[] values; + private Analyzer analyzer; + private int curIndex; // next index into the values array + private int curOffset; // offset into concatenated string + private TokenStream currentStream; // tokenStream currently being iterated + private boolean orderTokenOffsets; + + /** Constructs a TokenStream for consecutively-analyzed field values + * + * @param fieldName name of the field + * @param values array of field data + * @param analyzer analyzer instance + */ + public MultiValueTokenStream(String fieldName, String[] values, + Analyzer analyzer, boolean orderTokenOffsets) { + this.fieldName = fieldName; + this.values = values; + this.analyzer = analyzer; + curIndex = -1; + curOffset = 0; + currentStream = null; + this.orderTokenOffsets=orderTokenOffsets; + } + + /** Returns the next token in the stream, or null at EOS. */ + public Token next() throws IOException { + int extra = 0; + if(currentStream == null) { + curIndex++; + if(curIndex < values.length) { + currentStream = analyzer.tokenStream(fieldName, + new StringReader(values[curIndex])); + if (orderTokenOffsets) currentStream = new TokenOrderingFilter(currentStream,10); + // add extra space between multiple values + if(curIndex > 0) + extra = analyzer.getPositionIncrementGap(fieldName); + } else { + return null; + } + } + Token nextToken = currentStream.next(); + if(nextToken == null) { + curOffset += values[curIndex].length(); + currentStream = null; + return next(); + } + // create an modified token which is the offset into the concatenated + // string of all values + Token offsetToken = new Token(nextToken.termText(), + nextToken.startOffset() + curOffset, + nextToken.endOffset() + curOffset); + offsetToken.setPositionIncrement(nextToken.getPositionIncrement() + extra*10); + return offsetToken; + } + + /** + * Returns all values as a single String into which the Tokens index with + * their offsets. + */ + public String asSingleValue() { + StringBuilder sb = new StringBuilder(); + for(String str : values) + sb.append(str); + return sb.toString(); + } + +} + +/** + * A simple modification of SimpleFragmenter which additionally creates new + * fragments when an unusually-large position increment is encountered + * (this behaves much better in the presence of multi-valued fields). + */ +class GapFragmenter extends SimpleFragmenter { + /** + * When a gap in term positions is observed that is at least this big, treat + * the gap as a fragment delimiter. + */ + public static final int INCREMENT_THRESHOLD = 50; + protected int fragOffsetAccum = 0; + + public GapFragmenter() { + } + + public GapFragmenter(int fragsize) { + super(fragsize); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String) + */ + public void start(String originalText) { + fragOffsetAccum = 0; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token) + */ + public boolean isNewFragment(Token token) { + boolean isNewFrag = + token.endOffset() >= fragOffsetAccum + getFragmentSize() || + token.getPositionIncrement() > INCREMENT_THRESHOLD; + if(isNewFrag) { + fragOffsetAccum += token.endOffset() - fragOffsetAccum; + } + return isNewFrag; + } +} + +/** Orders Tokens in a window first by their startOffset ascending. + * endOffset is currently ignored. + * This is meant to work around fickleness in the highlighter only. It + * can mess up token positions and should not be used for indexing or querying. + */ +class TokenOrderingFilter extends TokenFilter { + private final int windowSize; + private final LinkedList queue = new LinkedList(); + private boolean done=false; + + protected TokenOrderingFilter(TokenStream input, int windowSize) { + super(input); + this.windowSize = windowSize; + } + + public Token next() throws IOException { + while (!done && queue.size() < windowSize) { + Token newTok = input.next(); + if (newTok==null) { + done=true; + break; + } + + // reverse iterating for better efficiency since we know the + // list is already sorted, and most token start offsets will be too. + ListIterator iter = queue.listIterator(queue.size()); + while(iter.hasPrevious()) { + if (newTok.startOffset() >= iter.previous().startOffset()) { + // insertion will be before what next() would return (what + // we just compared against), so move back one so the insertion + // will be after. + iter.next(); + break; + } + } + iter.add(newTok); + } + + return queue.isEmpty() ? null : queue.removeFirst(); + } +} diff --git a/src/java/org/apache/solr/util/SolrPluginUtils.java b/src/java/org/apache/solr/util/SolrPluginUtils.java index 6ee167e070f..7dc1503bc96 100644 --- a/src/java/org/apache/solr/util/SolrPluginUtils.java +++ b/src/java/org/apache/solr/util/SolrPluginUtils.java @@ -18,15 +18,11 @@ package org.apache.solr.util; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.highlight.*; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrException; import org.apache.solr.request.SolrParams; @@ -849,156 +845,5 @@ public class SolrPluginUtils { } -/** - * Helper class which creates a single TokenStream out of values from a - * multi-valued field. - */ -class MultiValueTokenStream extends TokenStream { - private String fieldName; - private String[] values; - private Analyzer analyzer; - private int curIndex; // next index into the values array - private int curOffset; // offset into concatenated string - private TokenStream currentStream; // tokenStream currently being iterated - private boolean orderTokenOffsets; - - /** Constructs a TokenStream for consecutively-analyzed field values - * - * @param fieldName name of the field - * @param values array of field data - * @param analyzer analyzer instance - */ - public MultiValueTokenStream(String fieldName, String[] values, - Analyzer analyzer, boolean orderTokenOffsets) { - this.fieldName = fieldName; - this.values = values; - this.analyzer = analyzer; - curIndex = -1; - curOffset = 0; - currentStream = null; - this.orderTokenOffsets=orderTokenOffsets; - } - - /** Returns the next token in the stream, or null at EOS. */ - public Token next() throws IOException { - int extra = 0; - if(currentStream == null) { - curIndex++; - if(curIndex < values.length) { - currentStream = analyzer.tokenStream(fieldName, - new StringReader(values[curIndex])); - if (orderTokenOffsets) currentStream = new TokenOrderingFilter(currentStream,10); - // add extra space between multiple values - if(curIndex > 0) - extra = analyzer.getPositionIncrementGap(fieldName); - } else { - return null; - } - } - Token nextToken = currentStream.next(); - if(nextToken == null) { - curOffset += values[curIndex].length(); - currentStream = null; - return next(); - } - // create an modified token which is the offset into the concatenated - // string of all values - Token offsetToken = new Token(nextToken.termText(), - nextToken.startOffset() + curOffset, - nextToken.endOffset() + curOffset); - offsetToken.setPositionIncrement(nextToken.getPositionIncrement() + extra*10); - return offsetToken; - } - - /** - * Returns all values as a single String into which the Tokens index with - * their offsets. - */ - public String asSingleValue() { - StringBuilder sb = new StringBuilder(); - for(String str : values) - sb.append(str); - return sb.toString(); - } - -} - -/** - * A simple modification of SimpleFragmenter which additionally creates new - * fragments when an unusually-large position increment is encountered - * (this behaves much better in the presence of multi-valued fields). - */ -class GapFragmenter extends SimpleFragmenter { - public static final int INCREMENT_THRESHOLD = 50; - protected int fragOffsetAccum = 0; - - public GapFragmenter() { - } - - public GapFragmenter(int fragsize) { - super(fragsize); - } - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String) - */ - public void start(String originalText) { - fragOffsetAccum = 0; - } - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token) - */ - public boolean isNewFragment(Token token) { - boolean isNewFrag = - token.endOffset() >= fragOffsetAccum + getFragmentSize() || - token.getPositionIncrement() > INCREMENT_THRESHOLD; - if(isNewFrag) { - fragOffsetAccum += token.endOffset() - fragOffsetAccum; - } - return isNewFrag; - } -} -/** Orders Tokens in a window first by their startOffset ascending. - * endOffset is currently ignored. - * This is meant to work around fickleness in the highlighter only. It - * can mess up token positions and should not be used for indexing or querying. - */ -class TokenOrderingFilter extends TokenFilter { - private final int windowSize; - private final LinkedList queue = new LinkedList(); - private boolean done=false; - - protected TokenOrderingFilter(TokenStream input, int windowSize) { - super(input); - this.windowSize = windowSize; - } - - public Token next() throws IOException { - while (!done && queue.size() < windowSize) { - Token newTok = input.next(); - if (newTok==null) { - done=true; - break; - } - - // reverse iterating for better efficiency since we know the - // list is already sorted, and most token start offsets will be too. - ListIterator iter = queue.listIterator(queue.size()); - while(iter.hasPrevious()) { - if (newTok.startOffset() >= iter.previous().startOffset()) { - // insertion will be before what next() would return (what - // we just compared against), so move back one so the insertion - // will be after. - iter.next(); - break; - } - } - iter.add(newTok); - } - - return queue.isEmpty() ? null : queue.removeFirst(); - } -}