From 0a0dcefb954966d3e67b8f9cc318aa1dcc92d8b9 Mon Sep 17 00:00:00 2001 From: David Wayne Smiley Date: Mon, 13 Apr 2015 13:35:35 +0000 Subject: [PATCH] SOLR-6692: Made standard highlighter more extensible Private methods are now protected, removed bad constructor, and refactored out getFieldValues() and getResponseForFragments() for both clarity and sub-class extension utility. Some refactoring of doHighlightingByHighlighter to clarify logic. These changes should have ZERO effect on highlights. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673183 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 2 + .../highlight/DefaultSolrHighlighter.java | 432 +++++++++--------- 2 files changed, 219 insertions(+), 215 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index b6bfeabcede..f7c73431214 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -128,6 +128,8 @@ Other Changes * SOLR-7384: Fix spurious failures in FullSolrCloudDistribCmdsTest. (shalin) +* SOLR-6692: The default highlighter is much more extensible. (David Smiley) + ================== 5.1.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release diff --git a/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java index 3d58134ab33..2a868cbaf5d 100644 --- a/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java +++ b/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java @@ -16,6 +16,18 @@ */ package org.apache.solr.highlight; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.Map; +import java.util.Set; + import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -58,18 +70,6 @@ import org.apache.solr.util.plugin.PluginInfoInitialized; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.ListIterator; -import java.util.Map; -import java.util.Set; - /** * * @since solr 1.3 @@ -79,11 +79,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf public static Logger log = LoggerFactory.getLogger(DefaultSolrHighlighter.class); - private SolrCore solrCore; - - public DefaultSolrHighlighter() { - } + protected final SolrCore solrCore; + /** Will be invoked via reflection */ public DefaultSolrHighlighter(SolrCore solrCore) { this.solrCore = solrCore; } @@ -207,7 +205,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf * @param fieldName The name of the field * @param request The SolrQueryRequest */ - private QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) { + protected QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) { QueryScorer scorer = new QueryScorer(query, request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false) ? fieldName : null); scorer.setExpandMultiTermQuery(request.getParams().getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true)); @@ -222,7 +220,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf * @param fieldName The name of the field * @param request The SolrQueryRequest */ - private Scorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) { + protected Scorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) { boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false); if (reqFieldMatch) { return new QueryTermScorer(query, request.getSearcher().getIndexReader(), fieldName); @@ -264,11 +262,11 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf protected Formatter getFormatter(String fieldName, SolrParams params ) { String str = params.getFieldParam( fieldName, HighlightParams.FORMATTER ); - SolrFormatter formatter = formatters.get( str ); + SolrFormatter formatter = formatters.get(str); if( formatter == null ) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown formatter: "+str ); } - return formatter.getFormatter( fieldName, params ); + return formatter.getFormatter(fieldName, params); } /** @@ -286,7 +284,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf if( encoder == null ) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown encoder: "+str ); } - return encoder.getEncoder( fieldName, params ); + return encoder.getEncoder(fieldName, params); } /** @@ -305,33 +303,33 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf if( frag == null ) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmenter: "+fmt ); } - return frag.getFragmenter( fieldName, params ); + return frag.getFragmenter(fieldName, params); } protected FragListBuilder getFragListBuilder( String fieldName, SolrParams params ){ String flb = params.getFieldParam( fieldName, HighlightParams.FRAG_LIST_BUILDER ); - SolrFragListBuilder solrFlb = fragListBuilders.get( flb ); + SolrFragListBuilder solrFlb = fragListBuilders.get(flb); if( solrFlb == null ){ throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragListBuilder: " + flb ); } - return solrFlb.getFragListBuilder( params ); + return solrFlb.getFragListBuilder(params); } protected FragmentsBuilder getFragmentsBuilder( String fieldName, SolrParams params ){ BoundaryScanner bs = getBoundaryScanner(fieldName, params); - return getSolrFragmentsBuilder( fieldName, params ).getFragmentsBuilder( params, bs ); + return getSolrFragmentsBuilder( fieldName, params ).getFragmentsBuilder(params, bs); } - - private SolrFragmentsBuilder getSolrFragmentsBuilder( String fieldName, SolrParams params ){ + + protected SolrFragmentsBuilder getSolrFragmentsBuilder( String fieldName, SolrParams params ){ String fb = params.getFieldParam( fieldName, HighlightParams.FRAGMENTS_BUILDER ); - SolrFragmentsBuilder solrFb = fragmentsBuilders.get( fb ); + SolrFragmentsBuilder solrFb = fragmentsBuilders.get(fb); if( solrFb == null ){ throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmentsBuilder: " + fb ); } return solrFb; } - - private BoundaryScanner getBoundaryScanner(String fieldName, SolrParams params){ + + protected BoundaryScanner getBoundaryScanner(String fieldName, SolrParams params){ String bs = params.getFieldParam(fieldName, HighlightParams.BOUNDARY_SCANNER); SolrBoundaryScanner solrBs = boundaryScanners.get(bs); if(solrBs == null){ @@ -406,10 +404,10 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf * If fieldName is undefined, this method returns false, then * doHighlightingByHighlighter() will do nothing for the field. */ - private boolean useFastVectorHighlighter( SolrParams params, IndexSchema schema, String fieldName ){ - SchemaField schemaField = schema.getFieldOrNull( fieldName ); + protected boolean useFastVectorHighlighter( SolrParams params, IndexSchema schema, String fieldName ){ + SchemaField schemaField = schema.getFieldOrNull(fieldName); if( schemaField == null ) return false; - boolean useFvhParam = params.getFieldBool( fieldName, HighlightParams.USE_FVH, false ); + boolean useFvhParam = params.getFieldBool(fieldName, HighlightParams.USE_FVH, false); if( !useFvhParam ) return false; boolean termPosOff = schemaField.storeTermPositions() && schemaField.storeTermOffsets(); if( !termPosOff ) { @@ -417,187 +415,13 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf } return termPosOff; } - + @SuppressWarnings("unchecked") - private void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries, - int docId, StoredDocument doc, String fieldName ) throws IOException { - final SolrIndexSearcher searcher = req.getSearcher(); - final IndexSchema schema = searcher.getSchema(); - - // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) - - // so we disable them until fixed (see LUCENE-3080)! - // BEGIN: Hack - final SchemaField schemaField = schema.getFieldOrNull(fieldName); - if (schemaField != null && schemaField.getType() instanceof org.apache.solr.schema.TrieField) return; - // END: Hack - + protected void doHighlightingByFastVectorHighlighter( FastVectorHighlighter highlighter, FieldQuery fieldQuery, + SolrQueryRequest req, NamedList docSummaries, int docId, StoredDocument doc, + String fieldName ) throws IOException { SolrParams params = req.getParams(); - - // preserve order of values in a multiValued list - boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false); - - List allFields = doc.getFields(); - if (allFields == null || allFields.isEmpty()) return; // No explicit contract that getFields returns != null, - // although currently it can't. - - int numFragments = getMaxSnippets(fieldName, params); - boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); - - String[] summaries = null; - List frags = new ArrayList<>(); - - //Try term vectors, which is faster - TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName); - final OffsetWindowTokenFilter tvWindowStream; - if (tvStream != null && schemaField.multiValued() && isActuallyMultiValued(allFields, fieldName)) { - tvWindowStream = new OffsetWindowTokenFilter(tvStream); - } else { - tvWindowStream = null; - } - - int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, - Integer.toString(Integer.MAX_VALUE))); - int mvToMatch = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH, - Integer.toString(Integer.MAX_VALUE))); - - for (StorableField thisField : allFields) { - if (mvToExamine <= 0 || mvToMatch <= 0) break; - - if (! thisField.name().equals(fieldName)) continue; // Is there a better way to do this? - - --mvToExamine; - String thisText = thisField.stringValue(); - TokenStream tstream; - if (tvWindowStream != null) { - // if we have a multi-valued field with term vectors, then get the next offset window - tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length()); - } else if (tvStream != null) { - tstream = tvStream; // single-valued with term vectors - } else { - // fall back to analyzer - tstream = createAnalyzerTStream(schema, fieldName, thisText); - } - - int maxCharsToAnalyze = params.getFieldInt(fieldName, - HighlightParams.MAX_CHARS, - Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); - - Highlighter highlighter; - if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) { - // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream - // needs to implement reset() efficiently. - - //If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary. - // It should be okay if OffsetLimit won't get applied in this case. - final TokenStream tempTokenStream; - if (tstream != tvStream) { - if (maxCharsToAnalyze < 0) { - tempTokenStream = new CachingTokenFilter(tstream); - } else { - tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze)); - } - } else { - tempTokenStream = tstream; - } - - // get highlighter - highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream); - - // if the CachingTokenFilter was consumed then use it going forward. - if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter)tempTokenStream).isCached()) { - tstream = tempTokenStream; - } - //tstream.reset(); not needed; getBestTextFragments will reset it. - } - else { - // use "the old way" - highlighter = getHighlighter(query, fieldName, req); - } - - if (maxCharsToAnalyze < 0) { - highlighter.setMaxDocCharsToAnalyze(thisText.length()); - } else { - highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); - } - - try { - TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments); - for (TextFragment bestTextFragment : bestTextFragments) { - if (preserveMulti) { - if (bestTextFragment != null) { - frags.add(bestTextFragment); - --mvToMatch; - } - } else { - if ((bestTextFragment != null) && (bestTextFragment.getScore() > 0)) { - frags.add(bestTextFragment); - --mvToMatch; - } - } - } - } catch (InvalidTokenOffsetsException e) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - }//end field value loop - - // sort such that the fragments with the highest score come first - if (!preserveMulti) { - Collections.sort(frags, new Comparator() { - @Override - public int compare(TextFragment arg0, TextFragment arg1) { - return Math.round(arg1.getScore() - arg0.getScore()); - } - }); - } - - // convert fragments back into text - // TODO: we can include score and position information in output as snippet attributes - if (frags.size() > 0) { - ArrayList fragTexts = new ArrayList<>(); - for (TextFragment fragment: frags) { - if (preserveMulti) { - if (fragment != null) { - fragTexts.add(fragment.toString()); - } - } else { - if ((fragment != null) && (fragment.getScore() > 0)) { - fragTexts.add(fragment.toString()); - } - } - - if (fragTexts.size() >= numFragments && !preserveMulti) break; - } - summaries = fragTexts.toArray(new String[0]); - if (summaries.length > 0) - docSummaries.add(fieldName, summaries); - } - // no summaries made, copy text from alternate field - if (summaries == null || summaries.length == 0) { - alternateField( docSummaries, params, doc, fieldName ); - } - } - - /** Is this field *actually* multi-valued for this document's fields? */ - private boolean isActuallyMultiValued(List allFields, String fieldName) { - boolean foundFirst = false; - for (StorableField field : allFields) { - if (field.name().equals(fieldName)) { - if (foundFirst) { - return true;//we found another - } else { - foundFirst = true; - } - } - } - return false;//0 or 1 value - } - - @SuppressWarnings("unchecked") - private void doHighlightingByFastVectorHighlighter( FastVectorHighlighter highlighter, FieldQuery fieldQuery, - SolrQueryRequest req, NamedList docSummaries, int docId, StoredDocument doc, - String fieldName ) throws IOException { - SolrParams params = req.getParams(); - SolrFragmentsBuilder solrFb = getSolrFragmentsBuilder( fieldName, params ); + SolrFragmentsBuilder solrFb = getSolrFragmentsBuilder(fieldName, params); String[] snippets = highlighter.getBestFragments( fieldQuery, req.getSearcher().getIndexReader(), docId, fieldName, params.getFieldInt( fieldName, HighlightParams.FRAGSIZE, 100 ), params.getFieldInt( fieldName, HighlightParams.SNIPPETS, 1 ), @@ -611,9 +435,187 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf else alternateField( docSummaries, params, doc, fieldName ); } - + @SuppressWarnings("unchecked") - private void alternateField( NamedList docSummaries, SolrParams params, StoredDocument doc, String fieldName ){ + protected void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries, + int docId, StoredDocument doc, String fieldName ) throws IOException { + final SolrIndexSearcher searcher = req.getSearcher(); + final IndexSchema schema = searcher.getSchema(); + final SolrParams params = req.getParams(); + + // TODO: highlighting numeric fields is broken (Lucene) - so we disable them until fixed (see LUCENE-3080)! + // BEGIN: Hack + final SchemaField schemaField = schema.getFieldOrNull(fieldName); + if (schemaField != null && schemaField.getType() instanceof org.apache.solr.schema.TrieField) return; + // END: Hack + + final int mvToExamine = + req.getParams().getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, + schemaField.multiValued() ? Integer.MAX_VALUE : 1); + // Technically this is the max *fragments* (snippets), not max values: + int mvToMatch = + req.getParams().getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE); + if (mvToExamine <= 0 || mvToMatch <= 0) { + return; + } + + int maxCharsToAnalyze = params.getFieldInt(fieldName, + HighlightParams.MAX_CHARS, + Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); + if (maxCharsToAnalyze < 0) {//e.g. -1 + maxCharsToAnalyze = Integer.MAX_VALUE; + } + + List fieldValues = getFieldValues(req, doc, fieldName, mvToExamine, maxCharsToAnalyze); + if (fieldValues.isEmpty()) { + return; + } + + // preserve order of values in a multiValued list + boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false); + + int numFragments = getMaxSnippets(fieldName, params); + boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); + + List frags = new ArrayList<>(); + + //Try term vectors, which is faster + final TokenStream tvStream = + TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName); + // We need to wrap in OffsetWindowTokenFilter if multi-valued + final OffsetWindowTokenFilter tvWindowStream; + if (tvStream != null && fieldValues.size() > 1) { + tvWindowStream = new OffsetWindowTokenFilter(tvStream); + } else { + tvWindowStream = null; + } + + for (String thisText : fieldValues) { + if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) { + break; + } + + TokenStream tstream; + if (tvWindowStream != null) { + // if we have a multi-valued field with term vectors, then get the next offset window + tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length()); + } else if (tvStream != null) { + tstream = tvStream; // single-valued with term vectors + } else { + // fall back to analyzer + tstream = createAnalyzerTStream(schema, fieldName, thisText); + } + + Highlighter highlighter; + if (req.getParams().getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) { + // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream + // needs to implement reset() efficiently. + + //If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary. + // It should be okay if OffsetLimit won't get applied in this case. + final TokenStream tempTokenStream; + if (tstream != tvStream) { + if (maxCharsToAnalyze >= thisText.length()) { + tempTokenStream = new CachingTokenFilter(tstream); + } else { + tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze)); + } + } else { + tempTokenStream = tstream; + } + + // get highlighter + highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream); + + // if the CachingTokenFilter was consumed then use it going forward. + if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter)tempTokenStream).isCached()) { + tstream = tempTokenStream; + } + //tstream.reset(); not needed; getBestTextFragments will reset it. + } else { + // use "the old way" + highlighter = getHighlighter(query, fieldName, req); + } + + highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); + + // Highlight! + try { + TextFragment[] bestTextFragments = + highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments); + for (TextFragment bestTextFragment : bestTextFragments) { + if (bestTextFragment == null)//can happen via mergeContiguousFragments + continue; + // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless. + if (bestTextFragment.getScore() > 0 || preserveMulti) { + frags.add(bestTextFragment); + --mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values + } + } + } catch (InvalidTokenOffsetsException e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + }//end field value loop + + // Put the fragments onto the Solr response (docSummaries) + if (frags.size() > 0) { + // sort such that the fragments with the highest score come first + if (!preserveMulti) { + Collections.sort(frags, new Comparator() {//TODO make TextFragment Comparable + @Override + public int compare(TextFragment arg0, TextFragment arg1) { + return Math.round(arg1.getScore() - arg0.getScore()); + } + }); + } + + // Truncate list to hl.snippets, but not when hl.preserveMulti + if (frags.size() > numFragments && !preserveMulti) { + frags = frags.subList(0, numFragments); + } + docSummaries.add(fieldName, getResponseForFragments(frags, req)); + } else { + // no summaries made, copy text from alternate field + alternateField(docSummaries, params, doc, fieldName); + } + } + + /** Fetches field values to highlight. If the field value should come from an atypical place (or another aliased + * field name, then a subclass could override to implement that. + */ + protected List getFieldValues(SolrQueryRequest req, StoredDocument doc, String fieldName, + int maxValues, int maxCharsToAnalyze) { + // Collect the Fields we will examine (could be more than one if multi-valued) + List result = new ArrayList<>(); + for (StorableField thisField : doc.getFields()) { + if (! thisField.name().equals(fieldName)) { + continue; + } + String value = thisField.stringValue(); + result.add(value); + + maxValues--; + if (maxValues <= 0 || maxCharsToAnalyze <= 0) { + break; + } + } + return result; + } + + /** Given the fragments, return the result to be put in the field {@link NamedList}. This is an extension + * point to allow adding other metadata like the absolute offsets or scores. + */ + protected Object getResponseForFragments(List frags, SolrQueryRequest req) { + // TODO: we can include score and position information in output as snippet attributes + ArrayList fragTexts = new ArrayList<>(); + for (TextFragment fragment : frags) { + fragTexts.add(fragment.toString()); + } + return fragTexts.toArray(new String[fragTexts.size()]); + } + + @SuppressWarnings("unchecked") + protected void alternateField( NamedList docSummaries, SolrParams params, StoredDocument doc, String fieldName ){ String alternateField = params.getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD); if (alternateField != null && alternateField.length() > 0) { StorableField[] docFields = doc.getFields(alternateField); @@ -652,7 +654,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf } } - private TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText) throws IOException { + protected TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText) throws IOException { return new TokenOrderingFilter(schema.getIndexAnalyzer().tokenStream(fieldName, docText), 10); } }