From 597bd5db77465e1282ebf722264423d631861596 Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Thu, 6 Sep 2018 10:50:56 -0700 Subject: [PATCH] SOLR-9418: Added a new (experimental) PhrasesIdentificationComponent for identifying potential phrases in query input based on overlapping shingles in the index --- solr/CHANGES.txt | 5 +- .../PhrasesIdentificationComponent.java | 1129 +++++++++++++++++ .../conf/schema-phrases-identification.xml | 97 ++ .../solrconfig-phrases-identification.xml | 53 + ...stCloudPhrasesIdentificationComponent.java | 200 +++ .../PhrasesIdentificationComponentTest.java | 796 ++++++++++++ 6 files changed, 2279 insertions(+), 1 deletion(-) create mode 100644 solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java create mode 100644 solr/core/src/test-files/solr/collection1/conf/schema-phrases-identification.xml create mode 100644 solr/core/src/test-files/solr/collection1/conf/solrconfig-phrases-identification.xml create mode 100644 solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java create mode 100644 solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 19db81ef9d7..3d947c7de0a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -1,4 +1,4 @@ - Apache Solr Release Notes + Apache Solr Release Notes Introduction ------------ @@ -208,6 +208,9 @@ New Features doc transformers if present. In 7.5 a missing 'fl' defaults to the current behavior of all fields, but in 8.0 defaults to the top/request "fl". (Moshe Bla, David Smiley) +* SOLR-9418: Added a new (experimental) PhrasesIdentificationComponent for identifying potential phrases + in query input based on overlapping shingles in the index. (Akash Mehta, Trey Grainger, hossman) + Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java b/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java new file mode 100644 index 00000000000..bac5a4c089a --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java @@ -0,0 +1,1129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.component; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.LongSummaryStatistics; +import java.util.Map; +import java.util.TreeMap; +import java.util.stream.Collectors; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.apache.lucene.analysis.shingle.ShingleFilterFactory; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRefBuilder; + +import org.apache.solr.analysis.TokenizerChain; +import org.apache.solr.client.solrj.SolrResponse; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.ShardParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.schema.FieldType; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.util.SolrPluginUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * A component that can be used in isolation, or in conjunction with {@link QueryComponent} to identify + * & score "phrases" found in the input string, based on shingles in indexed fields. + * + *

+ * The most common way to use this component is in conjunction with fields that use
+ * {@link ShingleFilterFactory} on both the index and query analyzers.
+ * An example field type configuration would be something like this...
+ *

+ *
+ * <fieldType name="phrases" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer type="index">
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/>
+ *   </analyzer>
+ *   <analyzer type="query">
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/>
+ *   </analyzer>
+ * </fieldType>
+ * 
+ *

+ * ...where the query analyzer's maxShingleSize="7" determines the maximum
+ * possible phrase length that can be heuristically deduced, and the index analyzer's
+ * maxShingleSize="3" determines the accuracy of the phrases identified. The larger the
+ * indexed maxShingleSize, the higher the accuracy. Both analyzers must include
+ * minShingleSize="2" outputUnigrams="true".
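+ *
+ * For example (a hedged sketch, using the expert override params defined later in this class; the
+ * values shown are illustrative only): the effective maximum phrase lengths may also be supplied
+ * per-request instead of being introspected from the shingle factories, as long as the index length
+ * does not exceed the query length...
+ *
+ * ModifiableSolrParams overrides = new ModifiableSolrParams();
+ * overrides.set("phrases", "true");
+ * overrides.set("phrases.maxlength.index", "3");  // must match the effective indexed shingle size
+ * overrides.set("phrases.maxlength.query", "7");
+ *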

+ *

+ * With a field type like this, one or more fields can be specified (with weights) via a
+ * phrases.fields param to request that this component identify possible phrases in the
+ * input q param, or in an alternative phrases.q override param. The identified
+ * phrases will include their scores relative to each field specified, as well as an overall
+ * weighted score based on the field weights provided by the client. Higher score values indicate
+ * a greater confidence in the Phrase.

+ * + *
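+ *
+ * For illustration, a minimal SolrJ sketch of such a request (hedged: the "/phrases" handler path and
+ * the multigrams_* field names simply mirror the test configs added in this patch, "client" is any
+ * SolrClient pointed at the collection, and the response structure is read the same way the new tests do)...
+ *
+ * ModifiableSolrParams params = new ModifiableSolrParams();
+ * params.set("q", "did a quick brown fox jump over the lazy dog");
+ * params.set("phrases", "true");
+ * params.set("phrases.fields", "multigrams_body multigrams_title^2");
+ * QueryRequest req = new QueryRequest(params);
+ * req.setPath("/phrases");
+ * QueryResponse rsp = req.process(client);
+ * NamedList phrases = (NamedList) rsp.getResponse().get("phrases");
+ * String summary = (String) phrases.get("summary"); // input decorated with phrases.pre / phrases.post markers
+ * List details = (List) phrases.get("details");     // one entry per identified phrase, with its scores
+ *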

+ * NOTE: In a distributed request, this component uses a single phase (piggybacking on the
+ * {@link ShardRequest#PURPOSE_GET_TOP_IDS} requests generated by {@link QueryComponent} if it is in use) to
+ * collect all field & shingle stats. No "refinement" requests are used.

+ * + * @lucene.experimental + */ +public class PhrasesIdentificationComponent extends SearchComponent { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + /** The only shard purpose that will cause this component to do work & return data during shard req */ + public static final int SHARD_PURPOSE = ShardRequest.PURPOSE_GET_TOP_IDS; + + /** Name, also used as a request param to identify whether the user query concerns this component */ + public static final String COMPONENT_NAME = "phrases"; + + // TODO: ideally these should live in a commons.params class? + public static final String PHRASE_INPUT = "phrases.q"; + public static final String PHRASE_FIELDS = "phrases.fields"; + public static final String PHRASE_ANALYSIS_FIELD = "phrases.analysis.field"; + public static final String PHRASE_SUMMARY_PRE = "phrases.pre"; + public static final String PHRASE_SUMMARY_POST = "phrases.post"; + public static final String PHRASE_INDEX_MAXLEN = "phrases.maxlength.index"; + public static final String PHRASE_QUERY_MAXLEN = "phrases.maxlength.query"; + + @Override + public void prepare(ResponseBuilder rb) throws IOException { + final SolrParams params = rb.req.getParams(); + if (!params.getBool(COMPONENT_NAME, false)) { + return; + } + if (params.getBool(ShardParams.IS_SHARD, false)) { + // only one stage/purpose where we should do any work on a shard + if (0 == (SHARD_PURPOSE & params.getInt(ShardParams.SHARDS_PURPOSE, 0))) { + return; + } + } + + // if we're still here, then we should parse & validate our input, + // putting it in the request context so our process method knows it should do work + rb.req.getContext().put(this.getClass(), PhrasesContextData.parseAndValidateRequest(rb.req)); + } + + @Override + public int distributedProcess(ResponseBuilder rb) { + final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass()); + if (null == contextData) { + // if prepare didn't give us anything to work with, then we should do nothing + return ResponseBuilder.STAGE_DONE; + } + + if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) { + return ResponseBuilder.STAGE_EXECUTE_QUERY; + + } else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) { + // if we're being used in conjunction with QueryComponent, it should have already created + // (in this staged) the only ShardRequest we need... + for (ShardRequest sreq : rb.outgoing) { + if (0 != (SHARD_PURPOSE & sreq.purpose) ) { + return ResponseBuilder.STAGE_GET_FIELDS; + } + } + // ...if we can't find it, then evidently we're being used in isolation, + // and we need to create our own ShardRequest... 
+ ShardRequest sreq = new ShardRequest(); + sreq.purpose = SHARD_PURPOSE; + sreq.params = new ModifiableSolrParams(rb.req.getParams()); + sreq.params.remove(ShardParams.SHARDS); + rb.addRequest(this, sreq); + return ResponseBuilder.STAGE_GET_FIELDS; + + } else if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) { + // NOTE: we don't do any actual work in this stage, but we need to ensure that even if + // we are being used in isolation w/o QueryComponent that SearchHandler "tracks" a STAGE_GET_FIELDS + // so that finishStage(STAGE_GET_FIELDS) is called on us and we can add our merged results + // (w/o needing extra code paths for merging phrase results when QueryComponent is/is not used) + return ResponseBuilder.STAGE_DONE; + } + + return ResponseBuilder.STAGE_DONE; + } + + @Override + public void finishStage(ResponseBuilder rb) { + // NOTE: we don't do this after STAGE_EXECUTE_QUERY because if we're also being used with + // QueryComponent, we don't want to add our results to the response until *after* + // QueryComponent adds the main DocList + + final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass()); + if (null == contextData || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) { + // if prepare didn't give us anything to work with, or this isn't our stage, then do nothing + return; + } + + // sanity check: the shard requests we use/piggy-back on should only hapen once per shard, + // but let's future proof ourselves against the possibility that some shards might get/respond + // to the same request "purpose" multiple times... + final BitSet shardsHandled = new BitSet(rb.shards.length); + + // Collect Shard responses + for (ShardRequest sreq : rb.finished) { + if (0 != (sreq.purpose & SHARD_PURPOSE)) { + for (ShardResponse shardRsp : sreq.responses) { + final int shardNum = rb.getShardNum(shardRsp.getShard()); + if (! shardsHandled.get(shardNum)) { + shardsHandled.set(shardNum); + // shards.tolerant=true can cause nulls on exceptions/errors + // if we don't get phrases/stats from a shard, just ignore that shard + final SolrResponse rsp = shardRsp.getSolrResponse(); + if (null == rsp) continue; + final NamedList top = rsp.getResponse(); + if (null == top) continue; + final NamedList phrasesWrapper = (NamedList) top.get("phrases"); + if (null == phrasesWrapper) continue; + final List> shardPhrases = (List>) phrasesWrapper.get("_all"); + if (null == shardPhrases) continue; + + Phrase.populateStats(contextData.allPhrases, shardPhrases); + } + } + } + } + scoreAndAddResultsToResponse(rb, contextData); + } + + + @Override + public void process(ResponseBuilder rb) throws IOException { + final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass()); + if (null == contextData) { + // if prepare didn't give us anything to work with, then we should do nothing + return; + } + + // regardless of single node / shard, we need local stats... + Phrase.populateStats(contextData.allPhrases, contextData.fieldWeights.keySet(), rb.req.getSearcher()); + + if ( rb.req.getParams().getBool(ShardParams.IS_SHARD, false) ) { + // shard request, return stats for all phrases (in original order) + SimpleOrderedMap output = new SimpleOrderedMap<>(); + output.add("_all", Phrase.formatShardResponse(contextData.allPhrases)); + // TODO: might want to add numDocs() & getSumTotalTermFreq(f)/getDocCount(f) stats from each field... + // so that we can sum/merge them for use in scoring? 
+ rb.rsp.add("phrases", output); + } else { + // full single node request... + scoreAndAddResultsToResponse(rb, contextData); + } + } + + /** + * Helper method (suitable for both single node & distributed coordinator node) to + * score, sort, and format the end user response once all phrases have been populated with stats. + */ + private void scoreAndAddResultsToResponse(final ResponseBuilder rb, final PhrasesContextData contextData) { + assert null != contextData : "Should not be called if no phrase data to use"; + if (null == contextData) { + // if prepare didn't give us anything to work with, then we should do nothing + return; + } + + SimpleOrderedMap output = new SimpleOrderedMap<>(); + rb.rsp.add("phrases", output); + output.add("input", contextData.rawInput); + + if (0 == contextData.allPhrases.size()) { + // w/o any phrases, the summary is just the input again... + output.add("summary", contextData.rawInput); + output.add("details", Collections.emptyList()); + return; + } + + Phrase.populateScores(contextData); + final int maxPosition = contextData.allPhrases.get(contextData.allPhrases.size()-1).getPositionEnd(); + + final List validScoringPhrasesSorted = contextData.allPhrases.stream() + // TODO: ideally this cut off of "0.0" should be a request option... + // so users can tune how aggresive/conservative they want to be in finding phrases + // but for that to be useful, we need: + // - more hard & fast documentation about the "range" of scores that may be returned + // - "useful" scores for single words + .filter(p -> 0.0D < p.getTotalScore()) + .sorted(Comparator.comparing((p -> p.getTotalScore()), Collections.reverseOrder())) + .collect(Collectors.toList()); + + // we want to return only high scoring phrases that don't overlap w/higher scoring phrase + final BitSet positionsCovered = new BitSet(maxPosition+1); + final List results = new ArrayList<>(maxPosition); + for (Phrase phrase : validScoringPhrasesSorted) { + final BitSet phrasePositions = phrase.getPositionsBitSet(); + + if (! phrasePositions.intersects(positionsCovered)) { + // we can use this phrase, record it... 
+ positionsCovered.or(phrasePositions); + results.add(phrase); + } // else: overlaps higher scoring position(s), skip this phrase + + if (positionsCovered.cardinality() == maxPosition+1) { + // all positions are covered, so we can bail out and skip the rest + break; + } + } + + // a "quick summary" of the suggested parsing + output.add("summary", contextData.summarize(results)); + // useful user level info on every (high scoring) phrase found (in current, descending score, order) + output.add("details", results.stream() + .map(p -> p.getDetails()).collect(Collectors.toList())); + } + + @Override + public String getDescription() { + return "Phrases Identification Component"; + } + + /** + * Simple container for all request options and data this component needs to store in the Request Context + * @lucene.internal + */ + public static final class PhrasesContextData { + + public final String rawInput; + public final int maxIndexedPositionLength; + public final int maxQueryPositionLength; + public final Map fieldWeights; + public final SchemaField analysisField; + public final List allPhrases; + public final String summaryPre; + public final String summaryPost; + + // TODO: add an option to bias field weights based on sumTTF of the fields + // (easy enough to "sum the sums" across multiple shards before scoring) + + /** + * Parses the params included in this request, throwing appropriate user level + * Exceptions for invalid input, and returning a PhrasesContextData + * suitable for use in this request. + */ + public static PhrasesContextData parseAndValidateRequest(final SolrQueryRequest req) throws SolrException { + return new PhrasesContextData(req); + } + private PhrasesContextData(final SolrQueryRequest req) throws SolrException { + final SolrParams params = req.getParams(); + + this.rawInput = params.get(PHRASE_INPUT, params.get(CommonParams.Q)); + if (null == this.rawInput) { + throw new SolrException(ErrorCode.BAD_REQUEST, "phrase identification requires a query string or " + + PHRASE_INPUT + " param override"); + } + + { // field weights & analysis field... + + SchemaField tmpAnalysisField = null; + Map tmpWeights = new TreeMap<>(); + + final String analysisFieldName = params.get(PHRASE_ANALYSIS_FIELD); + if (null != analysisFieldName) { + tmpAnalysisField = req.getSchema().getFieldOrNull(analysisFieldName); + if (null == tmpAnalysisField) { + throw new SolrException(ErrorCode.BAD_REQUEST, + PHRASE_ANALYSIS_FIELD + " param specifies a field name that does not exist: " + + analysisFieldName); + } + } + + final Map rawFields = SolrPluginUtils.parseFieldBoosts(params.getParams(PHRASE_FIELDS)); + if (rawFields.isEmpty()) { + throw new SolrException(ErrorCode.BAD_REQUEST, + PHRASE_FIELDS + " param must specify a (weighted) list of fields " + + "to evaluate for phrase identification"); + } + + for (Map.Entry entry : rawFields.entrySet()) { + final SchemaField field = req.getSchema().getFieldOrNull(entry.getKey()); + if (null == field) { + throw new SolrException(ErrorCode.BAD_REQUEST, + PHRASE_FIELDS + " param contains a field name that does not exist: " + + entry.getKey()); + } + if (null == tmpAnalysisField) { + tmpAnalysisField = field; + } + if ( null == analysisFieldName ) { + if (! 
field.getType().equals(tmpAnalysisField.getType())) { + throw new SolrException + (ErrorCode.BAD_REQUEST, + "All fields specified in " + PHRASE_FIELDS + " must have the same fieldType, " + + "or the advanced " + PHRASE_ANALYSIS_FIELD + " option must specify an override"); + } + } + // if a weight isn't specified, assume "1.0" + final double weight = null == entry.getValue() ? 1.0D : entry.getValue(); + if (weight < 0) { + throw new SolrException(ErrorCode.BAD_REQUEST, + PHRASE_FIELDS + " param must use non-negative weight value for field " + field.getName()); + } + tmpWeights.put(entry.getKey(), weight); + } + assert null != tmpAnalysisField; + + this.analysisField = tmpAnalysisField; + this.fieldWeights = Collections.unmodifiableMap(tmpWeights); + } + + { // index/query max phrase sizes... + final FieldType ft = analysisField.getType(); + this.maxIndexedPositionLength = req.getParams().getInt(PHRASE_INDEX_MAXLEN, + getMaxShingleSize(ft.getIndexAnalyzer())); + if (this.maxIndexedPositionLength < 0) { + throw new SolrException(ErrorCode.BAD_REQUEST, + "Unable to determine max position length of indexed phrases using " + + "index analyzer for analysis field: " + analysisField.getName() + + " and no override detected using param: " + PHRASE_INDEX_MAXLEN); + } + this.maxQueryPositionLength = req.getParams().getInt(PHRASE_QUERY_MAXLEN, + getMaxShingleSize(ft.getQueryAnalyzer())); + if (this.maxQueryPositionLength < 0) { + throw new SolrException(ErrorCode.BAD_REQUEST, + "Unable to determine max position length of query phrases using " + + "query analyzer for analysis field: " + analysisField.getName() + + " and no override detected using param: " + PHRASE_QUERY_MAXLEN); + } + if (this.maxQueryPositionLength < this.maxIndexedPositionLength) { + throw new SolrException + (ErrorCode.BAD_REQUEST, + "Effective value of " + PHRASE_INDEX_MAXLEN + " (either from index analyzer shingle factory, " + + " or expert param override) must be less then or equal to the effective value of " + + PHRASE_QUERY_MAXLEN + " (either from query analyzer shingle factory, or expert param override)"); + } + } + + this.summaryPre = params.get(PHRASE_SUMMARY_PRE, "{"); + this.summaryPost = params.get(PHRASE_SUMMARY_POST, "}"); + + this.allPhrases = Phrase.extractPhrases(this.rawInput, this.analysisField, + this.maxIndexedPositionLength, + this.maxQueryPositionLength); + + } + + /** + * Given a list of phrases to be returned to the user, summarizes those phrases by decorating the + * original input string to indicate where the identified phrases exist, using {@link #summaryPre} + * and {@link #summaryPost} + * + * @param results a list of (non overlapping) Phrases that have been identified, sorted from highest scoring to lowest + * @return the original user input, decorated to indicate the identified phrases + */ + public String summarize(final List results) { + final StringBuffer out = new StringBuffer(rawInput); + + // sort by *reverse* position so we can go back to front + final List reversed = results.stream() + .sorted(Comparator.comparing((p -> p.getPositionStart()), Collections.reverseOrder())) + .collect(Collectors.toList()); + + for (Phrase p : reversed) { + out.insert(p.getOffsetEnd(), summaryPost); + out.insert(p.getOffsetStart(), summaryPre); + } + return out.toString(); + } + } + + + /** + * Model the data known about a single (candidate) Phrase -- which may or may not be indexed + * @lucene.internal + */ + public static final class Phrase { + + /** + * Factory method for constructing a list of Phrases given the 
specified input and using the analyzer + * for the specified field. The maxIndexedPositionLength and + * maxQueryPositionLength provided *must* match the effective values used by + * respective analyzers. + */ + public static List extractPhrases(final String input, final SchemaField analysisField, + final int maxIndexedPositionLength, + final int maxQueryPositionLength) { + + // TODO: rather then requiring the query analyzer to produce the Phrases for us (assuming Shingles) + // we could potentially just require that it produces unigrams compatible with the unigrams in the + // indexed fields, and then build our own Phrases at query time -- making the maxQueryPositionLength + // a 100% run time configuration option. + // But that could be tricky given an arbitrary analyzer -- we'd have pay careful attention + // to positions, and we'd have to guess/assume what placeholders/fillers was used in the indexed Phrases + // (typically shingles) + + assert maxIndexedPositionLength <= maxQueryPositionLength; + + final CharsRefBuilder buffer = new CharsRefBuilder(); + final FieldType ft = analysisField.getType(); + final Analyzer analyzer = ft.getQueryAnalyzer(); + final List results = new ArrayList<>(42); + try (TokenStream tokenStream = analyzer.tokenStream(analysisField.getName(), input)) { + + final OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class); + final PositionIncrementAttribute posIncAttr = tokenStream.addAttribute(PositionIncrementAttribute.class); + final PositionLengthAttribute posLenAttr = tokenStream.addAttribute(PositionLengthAttribute.class); + final TermToBytesRefAttribute termAttr = tokenStream.addAttribute(TermToBytesRefAttribute.class); + + int position = 0; + int lastPosLen = -1; + + tokenStream.reset(); + while (tokenStream.incrementToken()) { + final Phrase phrase = new Phrase(); + + final int posInc = posIncAttr.getPositionIncrement(); + final int posLen = posLenAttr.getPositionLength(); + + if (0 == posInc && posLen <= lastPosLen) { + // This requirement of analyzers to return tokens in ascending order of length + // is currently neccessary for the "linking" logic below to work + // if people run into real world sitautions where this is problematic, + // we can relax this check if we also make the linking logic more complex + // (ie: less optimzied) + throw new SolrException + (ErrorCode.BAD_REQUEST, "Phrase identification currently requires that " + + "the analyzer used must produce tokens that overlap in increasing order of length. "); + } + + position += posInc; + lastPosLen = posLen; + + phrase.position_start = position; + phrase.position_end = position + posLen; + + phrase.is_indexed = (posLen <= maxIndexedPositionLength); + + phrase.offset_start = offsetAttr.startOffset(); + phrase.offset_end = offsetAttr.endOffset(); + + // populate the subsequence directly from the raw input using the offsets, + // (instead of using the TermToBytesRefAttribute) so we preserve the original + // casing, whitespace, etc... 
+ phrase.subSequence = input.subSequence(phrase.offset_start, phrase.offset_end); + + if (phrase.is_indexed) { + // populate the bytes so we can build term queries + phrase.bytes = BytesRef.deepCopyOf(termAttr.getBytesRef()); + } + + results.add(phrase); + } + tokenStream.end(); + } catch (IOException e) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Analysis error extracting phrases from: " + input, e); + } + + // fill in the relationships of each phrase + // + // NOTE: this logic currently requries that the phrases are sorted by position ascending + // (automatic because of how PositionIncrementAttribute works) then by length ascending + // (when positions are tied). + // We could de-optimize this code if we find that secondary ordering is too restrictive for + // some analyzers + // + // NOTE changes to scoring model may be allow optimize/prune down the relationships tracked, + // ...OR.... may require us to add/track more details about sub/parent phrases + // + for (int p = 0; p < results.size(); p++) { + final Phrase current = results.get(p); + if (! current.is_indexed) { + // we're not an interesting sub phrase of anything + continue; + } + + // setup links from the phrase to itself if needed + addLinkages(current, current, maxIndexedPositionLength); + + // scan backwards looking for phrases that might include us... + BEFORE: for (int i = p-1; 0 <= i; i--) { + final Phrase previous = results.get(i); + if (previous.position_start < (current.position_end - maxQueryPositionLength)) { + // we've scanned so far back nothing else is viable + break BEFORE; + } + // any 'previous' phrases must start where current starts or earlier, + // so only need to check the end... + if (current.position_end <= previous.position_end) { + addLinkages(previous, current, maxIndexedPositionLength); + } + } + // scan forwards looking for phrases that might include us... + AFTER: for (int i = p+1; i < results.size(); i++) { + final Phrase next = results.get(i); + // the only way a phrase that comes after current can include current is + // if they have the same start position... + if (current.position_start != next.position_start) { + // we've scanned so far forward nothing else is viable + break AFTER; + } + // any 'next' phrases must start where current starts, so only need to check the end... 
+ if (current.position_end <= next.position_end) { + addLinkages(next, current, maxIndexedPositionLength); + } + } + } + + return Collections.unmodifiableList(results); + } + + /** + * Given two phrases, one of which is a super set of the other, adds the neccessary linkages + * needed by the scoring model + */ + private static void addLinkages(final Phrase outer, final Phrase inner, + final int maxIndexedPositionLength) { + + assert outer.position_start <= inner.position_start; + assert inner.position_end <= outer.position_end; + assert inner.is_indexed; + + final int inner_len = inner.getPositionLength(); + if (1 == inner_len) { + outer.individualIndexedTerms.add(inner); + } + if (maxIndexedPositionLength == inner_len + || (inner == outer && inner_len < maxIndexedPositionLength)) { + outer.largestIndexedSubPhrases.add(inner); + } + if (outer.is_indexed && inner != outer) { + inner.indexedSuperPhrases.add(outer); + } + } + + /** + * Format the phrases suitable for returning in a shard response + * @see #populateStats(List,List) + */ + public static List> formatShardResponse(final List phrases) { + List> results = new ArrayList<>(phrases.size()); + for (Phrase p : phrases) { + NamedList data = new SimpleOrderedMap<>(); + // quick and dirty way to validate that our shards aren't using different analyzers + // so the coordinating node can fail fast when mergingthe results + data.add("checksum", p.getChecksum()); + if (p.is_indexed) { + data.add("ttf", new NamedList(p.phrase_ttf)); + data.add("df", new NamedList(p.phrase_df)); + } + data.add("conj_dc", new NamedList(p.subTerms_conjunctionCounts)); + + results.add(data); + } + return results; + } + + /** + * Populates the phrases with (merged) stats from a remote shard + * @see #formatShardResponse + */ + public static void populateStats(final List phrases, final List> shardData) { + final int numPhrases = phrases.size(); + if (shardData.size() != numPhrases) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "num phrases in shard data not consistent: " + + numPhrases + " vs " + shardData.size()); + } + for (int i = 0; i < phrases.size(); i++) { + // rather then being paranoid about the expected structure, we'll just let the low level + // code throw an NPE / CCE / AIOOBE / etc. and wrap & rethrow later... + try { + final Phrase p = phrases.get(i); + final NamedList data = shardData.get(i); + // sanity check the correct phrase + if (! 
p.getChecksum().equals(data.get("checksum"))) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "phrase #" + i + " in shard data had invalid checksum"); + } + if (p.is_indexed) { + for (Map.Entry ttf : (NamedList) data.get("ttf")) { + p.phrase_ttf.merge(ttf.getKey(), ttf.getValue(), Long::sum); + } + for (Map.Entry df : (NamedList) data.get("df")) { + p.phrase_df.merge(df.getKey(), df.getValue(), Long::sum); + } + } + for (Map.Entry conj_dc : (NamedList) data.get("conj_dc")) { + p.subTerms_conjunctionCounts.merge(conj_dc.getKey(), conj_dc.getValue(), Long::sum); + } + } catch (RuntimeException e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "shard data for phrase#" + i + " not consistent", e); + } + } + } + + /** + * Populates the phrases with stats from the local index for the specified fields + */ + public static void populateStats(final List phrases, final Collection fieldNames, + final SolrIndexSearcher searcher) throws IOException { + final IndexReader reader = searcher.getIndexReader(); + for (String field : fieldNames) { + for (Phrase phrase : phrases) { + if (phrase.is_indexed) { + // add stats based on this entire phrase as an indexed term + final Term t = new Term(field, phrase.bytes); + phrase.phrase_ttf.put(field, reader.totalTermFreq(t)); + phrase.phrase_df.put(field, (long)reader.docFreq(t)); + } + + // even if our phrase is too long to be indexed whole, add stats based on the + // conjunction of all the individual terms in the phrase + List filters = new ArrayList<>(phrase.individualIndexedTerms.size()); + for (Phrase term : phrase.individualIndexedTerms) { + // trust the SolrIndexSearcher to cache & intersect the individual terms so that this + // can be efficient regardless of how often terms are re-used multiple times in the input/phrases + filters.add(new TermQuery(new Term(field, term.bytes))); + } + final long count = searcher.getDocSet(filters).size(); + phrase.subTerms_conjunctionCounts.put(field, count); + } + } + } + + /** + * Uses the previously popuated stats to populate each Phrase with it's scores for the specified fields, + * and it's over all (weighted) total score. This is not needed on shard requests. + * + * @see #populateStats + * @see #getFieldScore(String) + * @see #getTotalScore + */ + public static void populateScores(final PhrasesContextData contextData) { + populateScores(contextData.allPhrases, contextData.fieldWeights, + contextData.maxIndexedPositionLength, + contextData.maxQueryPositionLength); + } + + /** + * Public for testing purposes + * @see #populateScores(PhrasesIdentificationComponent.PhrasesContextData) + * @lucene.internal + */ + public static void populateScores(final List phrases, final Map fieldWeights, + final int maxIndexedPositionLength, + final int maxQueryPositionLength) { + final double total_weight = fieldWeights.values().stream().mapToDouble(Double::doubleValue).sum(); + for (Phrase phrase : phrases) { + double phrase_cumulative_score = 0.0D; + for (Map.Entry entry : fieldWeights.entrySet()) { + final String field = entry.getKey(); + final double weight = entry.getValue(); + double field_score = computeFieldScore(phrase, field, + maxIndexedPositionLength, maxQueryPositionLength); + phrase.fieldScores.put(field,field_score); + phrase_cumulative_score += (field_score * weight); + } + phrase.total_score = (total_weight < 0 ? 
Double.NEGATIVE_INFINITY + : (phrase_cumulative_score / total_weight)); + } + } + + private Phrase() { + // No-Op + } + + private boolean is_indexed; + private double total_score = -1.0D; // until we get a computed score, this is "not a phrase" + + private CharSequence subSequence; + private BytesRef bytes; + private int offset_start; + private int offset_end; + private int position_start; + private int position_end; + private Integer checksum = null; + + /** NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves */ + private final List individualIndexedTerms = new ArrayList<>(7); + /** + * NOTE: Indexed phrases of length less then the max indexed length are the (sole) + * largest sub-phrases of themselves + */ + private final List largestIndexedSubPhrases = new ArrayList<>(7); + /** Phrases larger then this phrase which are indexed and fully contain it */ + private final List indexedSuperPhrases = new ArrayList<>(7); + + // NOTE: keys are field names + private final Map subTerms_conjunctionCounts = new TreeMap<>(); + private final Map phrase_ttf = new TreeMap<>(); + private final Map phrase_df = new TreeMap<>(); + private final Map fieldScores = new TreeMap<>(); + + public String toString() { + return "'" + subSequence + "'" + + "[" + offset_start + ":" + offset_end + "]" + + "[" + position_start + ":" + position_end + "]"; + } + + public NamedList getDetails() { + SimpleOrderedMap out = new SimpleOrderedMap(); + out.add("text", subSequence); + out.add("offset_start", getOffsetStart()); + out.add("offset_end", getOffsetEnd()); + out.add("score", getTotalScore()); + out.add("field_scores", fieldScores); + return out; + } + + /** + * Computes & caches the checksum of this Phrase (if not already cached). + * needed only when merging shard data to validate no inconsistencies with the remote shards + */ + private Integer getChecksum() { + if (null == checksum) { + checksum = Arrays.hashCode(new int[] { offset_start, offset_end, position_start, position_end }); + } + return checksum; + } + /** The characters from the original input that corrispond with this Phrase */ + public CharSequence getSubSequence() { + return subSequence; + } + + /** + * Returns the list of "individual" (ie: getPositionLength()==1 terms. + * NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves + */ + public List getIndividualIndexedTerms() { + return individualIndexedTerms; + } + /** + * Returns the list of (overlapping) sub phrases that have the largest possible size based on + * the effective value of {@link PhrasesContextData#maxIndexedPositionLength}. + * NOTE: Indexed phrases of length less then the max indexed length are the (sole) + * largest sub-phrases of themselves. + */ + public List getLargestIndexedSubPhrases() { + return largestIndexedSubPhrases; + } + /** + * Returns all phrases larger then this phrase, which fully include this phrase, and are indexed. + * NOTE: A Phrase is never the super phrase of itself. 
+ */ + public List getIndexedSuperPhrases() { + return indexedSuperPhrases; + } + + /** NOTE: positions start at '1' */ + public int getPositionStart() { + return position_start; + } + /** NOTE: positions start at '1' */ + public int getPositionEnd() { + return position_end; + } + public int getPositionLength() { + return position_end - position_start; + } + /** Each set bit identifies a position filled by this Phrase */ + public BitSet getPositionsBitSet() { + final BitSet result = new BitSet(); + result.set(position_start, position_end); + return result; + } + public int getOffsetStart() { + return offset_start; + } + public int getOffsetEnd() { + return offset_end; + } + + /** + * Returns the overall score for this Phrase. In the current implementation, + * the only garuntee made regarding the range of possible values is that 0 (or less) means + * it is not a good phrase. + * + * @return A numeric value indicating the confidence in this Phrase, higher numbers are higher confidence. + */ + public double getTotalScore() { + return total_score; + } + /** + * Returns the score for this Phrase in this given field. In the current implementation, + * the only garuntee made regarding the range of possible values is that 0 (or less) means + * it is not a good phrase. + * + * @return A numeric value indicating the confidence in this Phrase for this field, higher numbers are higher confidence. + */ + public double getFieldScore(String field) { + return fieldScores.getOrDefault(field, -1.0D); + } + + /** + * Returns the number of total TTF of this (indexed) Phrase as term in the specified field. + * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats} + * methods has been called with this field. + */ + public long getTTF(String field) { + if (!is_indexed) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "TTF is only available for indexed phrases"); + } + return phrase_ttf.getOrDefault(field, 0L); + } + /** + * Returns the number of documents that contain all of the {@link #getIndividualIndexedTerms} + * that make up this Phrase, in the specified field. + * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats} + * methods has been called with this field. + */ + public long getConjunctionDocCount(String field) { + return subTerms_conjunctionCounts.getOrDefault(field, 0L); + } + /** + * Returns the number of documents that contain this (indexed) Phrase as term + * in the specified field. + * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats} + * methods has been called with this field. + */ + public long getDocFreq(String field) { + if (!is_indexed) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "DF is only available for indexed phrases"); + } + return phrase_df.getOrDefault(field, 0L); + } + + /** + * Uses the previously popuated stats to compute a score for the specified field. + * + *

+ * The current implementation returns scores in the range of [0,1], but this
+ * may change in future implementations. The only current guarantees are:

+ * + *
    + *
  • 0 (or less) means this is guaranteed to not be a phrase
  • + *
  • larger numbers are higher confidence
  • + * + * + * @see #populateStats + * @see #populateScores + * @see #getFieldScore(String) + * @return a score value + */ + private static double computeFieldScore(final Phrase input, + final String field, + final int maxIndexedPositionLength, + final int maxQueryPositionLength) { + final long num_indexed_sub_phrases = input.getLargestIndexedSubPhrases().size(); + assert 0 <= num_indexed_sub_phrases; // should be impossible + + if (input.getIndividualIndexedTerms().size() < input.getPositionLength()) { + // there are "gaps" in our input, where individual words have not been indexed (stop words, + // or multivalue position gap) which means we are not a viable candidate for being a valid Phrase. + return -1.0D; + } + + final long phrase_conj_count = input.getConjunctionDocCount(field); + // if there isn't a single document containing all the terms in our + // phrase, then it is 100% not a phrase + if (phrase_conj_count <= 0) { + return -1.0D; + } + + // single words automatically score 0.0 (unless they already scored less for not existing + if (input.getPositionLength() <= 1) { + return 0.0D; + } + + double field_score = 0.0D; + long max_sub_conj_count = phrase_conj_count; + + // At the moment, the contribution of each "words" sub-Phrase to the field score to the input + // Phrase is independent of any context of "input". Depending on if/how sub-phrase scoring + // changes, we might consider computing the scores of all the indexed phrases first, and + // aching the portions of their values that are re-used when computing the scores of + // longer phrases? + // + // This would make the overall scoring of all phrases a lot more complicated, + // but could save CPU cycles? + // (particularly when maxIndexedPositionLength <<< maxQueryPositionLength ???) + // + // My gut says that knowing the conj_count(input) "context" should help us score the + // sub-phrases better, but i can't yet put my finger on why/how. maybe by comparing + // the conj_count(input) to the max(conj_count(parent of words)) ? + + // for each of the longest indexed phrases, aka indexed sub-sequence of "words", we have... + for (Phrase words : input.getLargestIndexedSubPhrases()) { + // we're going to compute scores in range of [-1:1] to indicate the likelihood that our + // "words" should be used as a "phrase", based on a bayesian document categorization model, + // where the "words as a phrase" (aka: phrase) is our candidate category. + // + // P(words|phrase) * P(phrase) - P(words|not phrase) * P(not phrase) + // + // Where... + // P(words|phrase) = phrase_ttf / min(word_ttf) + // P(phrase) =~ phrase_docFreq / conj_count(words in phrase) *SEE NOTE BELOW* + // P(words|not phrase) = phrase_ttf / max(word_ttf) + // P(not a phrase) = 1 - P(phrase) + // + // ... BUT! ... + // + // NOTE: we're going to reduce our "P(phrase) by the max "P(phrase)" of all the (indexed) + // candidate phrases we are a sub-phrase of, to try to offset the inherent bias in favor + // of small indexed phrases -- because anytime the super-phrase exists, the sub-phrase exists + + + // IDEA: consider replacing this entire baysian model with LLR (or rootLLR)... + // http://mahout.apache.org/docs/0.13.0/api/docs/mahout-math/org/apache/mahout/math/stats/LogLikelihood.html + // ...where we compute LLR over each of the TTF of the pairs of adjacent sub-phrases of each + // indexed phrase and take the min|max|avg of the LLR scores. 
+ // + // ie: for indexed shingle "quick brown fox" compute LLR(ttf("quick"), ttf("brown fox")) & + // LLR(ttf("quick brown"), ttf("fox")) using ttf("quick brown fox") as the co-occurance + // count, and sumTTF-ttf("quick")-ttf("brown")-ttf("fox") as the "something else" + // + // (we could actually compute LLR stats over TTF and DF and combine them) + // + // NOTE: Going the LLR/rootLLR route would require building a full "tree" of every (indexed) + // sub-phrase of every other phrase (or at least: all siblings of diff sizes that add up to + // an existing phrase). As well as require us to give up on a predictible "range" of + // legal values for scores (IIUC from the LLR docs) + + final long phrase_ttf = words.getTTF(field); + final long phrase_df = words.getDocFreq(field); + final long words_conj_count = words.getConjunctionDocCount(field); + max_sub_conj_count = Math.max(words_conj_count, max_sub_conj_count); + + final double max_wrapper_phrase_probability = + words.getIndexedSuperPhrases().stream() + .mapToDouble(p -> p.getConjunctionDocCount(field) <= 0 ? + // special case check -- we already know *our* conj count > 0, + // but we need a similar check for wrapper phrases: if <= 0, their probability is 0 + 0.0D : ((double)p.getDocFreq(field) / p.getConjunctionDocCount(field))).max().orElse(0.0D); + + final LongSummaryStatistics words_ttfs = + words.getIndividualIndexedTerms().stream() + .collect(Collectors.summarizingLong(t -> t.getTTF(field))); + + final double words_phrase_prob = (phrase_ttf / (double)words_ttfs.getMin()); + final double words_not_phrase_prob = (phrase_ttf / (double)words_ttfs.getMax()); + + final double phrase_prob = (phrase_conj_count / (double)words_conj_count); + + + final double phrase_score = words_phrase_prob * (phrase_prob - max_wrapper_phrase_probability); + final double not_phrase_score = words_not_phrase_prob * (1 - (phrase_prob - max_wrapper_phrase_probability)); + final double words_score = phrase_score - not_phrase_score; + + field_score += words_score; + } + + // NOTE: the "scaling" factors below can "increase" negative scores (by reducing the unsigned value) + // when they should ideally be penalizing the scores further, but since we currently don't care + // about any score lower then 0, it's not worth worrying about. + + // Average the accumulated score over the number of actual indexed sub-phrases that contributed + // + // NOTE: since we subsequently want to multiply the score by a fraction with num_indexed_sub_phrases + // in the numerator, we can skip this... + // SEE BELOW // field_score /= (double) num_indexed_sub_phrases; + + // If we leave field_score as is, then a phrase longer then the maxIndexedPositionLength + // will never score higher then the highest scoring sub-phrase it has (because we've averaged them) + // so we scale the scores against the longest possible phrase length we're considering + // + // NOTE: We don't use num_indexed_sub_phrases in the numerator since we skipped it when + // averating above... 
+ field_score *= ( 1.0D // SEE ABOVE // * ( (double)num_indexed_sub_phrases ) + / (1 + maxQueryPositionLength - maxIndexedPositionLength) ); + + // scale the field_score based on the ratio of the conjunction docCount for the whole phrase + // realtive to the largest conjunction docCount of it's (largest indexed) sub phrases, to penalize + // the scores of very long phrases that exist very rarely relative to the how often their + // sub phrases exist in the index + field_score *= ( ((double) phrase_conj_count) / max_sub_conj_count); + + return field_score; + } + } + + /** + * Helper method, public for testing purposes only. + *

+ * Given an analyzer, this method inspects it to determine if:
+ *

      + *
    • it is a {@link TokenizerChain}
    • + *
    • it contains exactly one instance of {@link ShingleFilterFactory}
    • + *
    + *

+ * If these conditions are met, then this method returns the maxShingleSize
+ * in effect for this analyzer, otherwise it returns -1.
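+ *
+ * A rough usage sketch (assuming a SolrQueryRequest req whose schema contains the "phrases"
+ * field type shown in the class level javadoc; the names here are illustrative only)...
+ *
+ * FieldType ft = req.getSchema().getFieldTypeByName("phrases");
+ * int indexedMax = PhrasesIdentificationComponent.getMaxShingleSize(ft.getIndexAnalyzer()); // 3
+ * int queryMax = PhrasesIdentificationComponent.getMaxShingleSize(ft.getQueryAnalyzer());   // 7
+ *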

    + * + * @param analyzer An analyzer inspect + * @return maxShingleSize if available + * @lucene.internal + */ + public static int getMaxShingleSize(Analyzer analyzer) { + if (!TokenizerChain.class.isInstance(analyzer)) { + return -1; + } + + final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories(); + if (0 == factories.length) { + return -1; + } + int result = -1; + for (TokenFilterFactory tff : factories) { + if (ShingleFilterFactory.class.isInstance(tff)) { + if (0 < result) { + // more then one shingle factory in our analyzer, which is weird, so make no assumptions... + return -1; + } + // would be nice if there was an easy way to just ask a factory for the effective value + // of an arguement... + final Map args = tff.getOriginalArgs(); + result = args.containsKey("maxShingleSize") + ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE; + } + } + return result; + } +} diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-phrases-identification.xml b/solr/core/src/test-files/solr/collection1/conf/schema-phrases-identification.xml new file mode 100644 index 00000000000..ab38f9fee3b --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/schema-phrases-identification.xml @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrases-identification.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrases-identification.xml new file mode 100644 index 00000000000..65ccd5e0965 --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrases-identification.xml @@ -0,0 +1,53 @@ + + + + + ${tests.luceneMatchVersion:LATEST} + + + + + + + + + phrases + + + explicit + true + body + multigrams_body multigrams_title^2 + + + + + + + phrases + + + explicit + true + true + multigrams_body multigrams_title^2 + + + + + diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java new file mode 100644 index 00000000000..cbe1cdce946 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.cloud; + +import java.lang.invoke.MethodHandles; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.lucene.util.TestUtil; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.embedded.JettySolrRunner; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; + +import org.junit.AfterClass; +import org.junit.BeforeClass; + +/** + * A very simple sanity check that Phrase Identification works across a cloud cluster + * using distributed term stat collection. + * + * @see org.apache.solr.handler.component.PhrasesIdentificationComponentTest + */ +@Slow +public class TestCloudPhrasesIdentificationComponent extends SolrCloudTestCase { + + private static final String DEBUG_LABEL = MethodHandles.lookup().lookupClass().getName(); + private static final String COLLECTION_NAME = DEBUG_LABEL + "_collection"; + + /** A basic client for operations at the cloud level, default collection will be set */ + private static CloudSolrClient CLOUD_CLIENT; + /** One client per node */ + private static ArrayList CLIENTS = new ArrayList<>(5); + + @BeforeClass + private static void createMiniSolrCloudCluster() throws Exception { + + // multi replicas should not matter... + final int repFactor = usually() ? 1 : 2; + // ... but we definitely want to test multiple shards + final int numShards = TestUtil.nextInt(random(), 1, (usually() ? 2 :3)); + final int numNodes = (numShards * repFactor); + + final String configName = DEBUG_LABEL + "_config-set"; + final Path configDir = Paths.get(TEST_HOME(), "collection1", "conf"); + + configureCluster(numNodes).addConfig(configName, configDir).configure(); + + Map collectionProperties = new LinkedHashMap<>(); + collectionProperties.put("config", "solrconfig-phrases-identification.xml"); + collectionProperties.put("schema", "schema-phrases-identification.xml"); + CollectionAdminRequest.createCollection(COLLECTION_NAME, configName, numShards, repFactor) + .setProperties(collectionProperties) + .process(cluster.getSolrClient()); + + CLOUD_CLIENT = cluster.getSolrClient(); + CLOUD_CLIENT.setDefaultCollection(COLLECTION_NAME); + + waitForRecoveriesToFinish(CLOUD_CLIENT); + + for (JettySolrRunner jetty : cluster.getJettySolrRunners()) { + CLIENTS.add(getHttpSolrClient(jetty.getBaseUrl() + "/" + COLLECTION_NAME + "/")); + } + + // index some docs... + CLOUD_CLIENT.add + (sdoc("id", "42", + "title","Tale of the Brown Fox: was he lazy?", + "body", "No. The quick brown fox was a very brown fox who liked to get into trouble.")); + CLOUD_CLIENT.add + (sdoc("id", "43", + "title","A fable in two acts", + "body", "The brOwn fOx jumped. The lazy dog did not")); + CLOUD_CLIENT.add + (sdoc("id", "44", + "title","Why the LazY dog was lazy", + "body", "News flash: Lazy Dog was not actually lazy, it just seemd so compared to Fox")); + CLOUD_CLIENT.add + (sdoc("id", "45", + "title","Why Are We Lazy?", + "body", "Because we are. 
that's why")); + CLOUD_CLIENT.commit(); + } + + @AfterClass + private static void afterClass() throws Exception { + CLOUD_CLIENT.close(); CLOUD_CLIENT = null; + for (HttpSolrClient client : CLIENTS) { + client.close(); + } + CLIENTS = null; + } + + public void testBasicPhrases() throws Exception { + final String input = " did a Quick brown FOX perniciously jump over the lazy dog"; + final String expected = " did a Quick {brown FOX} perniciously jump over {the lazy dog}"; + + // based on the documents indexed, these assertions should all pass regardless of + // how many shards we have, or wether the request is done via /phrases or /select... + for (String path : Arrays.asList("/select", "/phrases")) { + // ... or if we muck with "q" and use the alternative phrases.q for the bits we care about... + for (SolrParams p : Arrays.asList(params("q", input, "phrases", "true"), + params("q", "*:*", "phrases.q", input, "phrases", "true"), + params("q", "-*:*", "phrases.q", input, "phrases", "true"))) { + final QueryRequest req = new QueryRequest(p); + req.setPath(path); + final QueryResponse rsp = req.process(getRandClient(random())); + try { + NamedList phrases = (NamedList) rsp.getResponse().get("phrases"); + assertEquals("input", input, phrases.get("input")); + assertEquals("summary", expected, phrases.get("summary")); + + final List> details = (List>) phrases.get("details"); + assertNotNull("null details", details); + assertEquals("num phrases found", 2, details.size()); + + final NamedList lazy_dog = details.get(0); + assertEquals("dog text", "the lazy dog", lazy_dog.get("text")); + assertEquals("dog score", 0.166666D, ((Double)lazy_dog.get("score")).doubleValue(), 0.000001D); + + final NamedList brown_fox = details.get(1); + assertEquals("fox text", "brown FOX", brown_fox.get("text")); + assertEquals("fox score", 0.083333D, ((Double)brown_fox.get("score")).doubleValue(), 0.000001D); + + } catch (AssertionError e) { + throw new AssertionError(e.getMessage() + " ::: " + path + " ==> " + rsp, e); + } + } + } + } + + public void testEmptyInput() throws Exception { + // empty input shouldn't error, just produce empty results... + for (String input : Arrays.asList("", " ")) { + for (SolrParams p : Arrays.asList(params("q", "*:*", "phrases.q", input, "phrases", "true"), + params("q", "-*:*", "phrases.q", input, "phrases", "true"))) { + final QueryRequest req = new QueryRequest(p); + req.setPath("/phrases"); + final QueryResponse rsp = req.process(getRandClient(random())); + try { + NamedList phrases = (NamedList) rsp.getResponse().get("phrases"); + assertEquals("input", input, phrases.get("input")); + assertEquals("summary", input, phrases.get("summary")); + + final List> details = (List>) phrases.get("details"); + assertNotNull("null details", details); + assertEquals("num phrases found", 0, details.size()); + + } catch (AssertionError e) { + throw new AssertionError(e.getMessage() + " ==> " + rsp, e); + } + } + } + } + + /** + * returns a random SolrClient -- either a CloudSolrClient, or an HttpSolrClient pointed + * at a node in our cluster + */ + public static SolrClient getRandClient(Random rand) { + int numClients = CLIENTS.size(); + int idx = TestUtil.nextInt(rand, 0, numClients); + + return (idx == numClients) ? 
CLOUD_CLIENT : CLIENTS.get(idx); + } + + public static void waitForRecoveriesToFinish(CloudSolrClient client) throws Exception { + assert null != client.getDefaultCollection(); + AbstractDistribZkTestBase.waitForRecoveriesToFinish(client.getDefaultCollection(), + client.getZkStateReader(), + true, true, 330); + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java new file mode 100644 index 00000000000..c8d9edfacef --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java @@ -0,0 +1,796 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.component; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.function.BiConsumer; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.handler.component.PhrasesIdentificationComponent; +import org.apache.solr.handler.component.PhrasesIdentificationComponent.Phrase; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.common.params.ShardParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; + +import org.junit.After; +import org.junit.BeforeClass; +import org.junit.Before; +import org.hamcrest.Description; +import org.hamcrest.Matcher; +import org.hamcrest.BaseMatcher; + +public class PhrasesIdentificationComponentTest extends SolrTestCaseJ4 { + + private static final String HANDLER = "/phrases"; + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-phrases-identification.xml","schema-phrases-identification.xml"); + } + + @Before + public void addSomeDocs() throws Exception { + assertU(adoc("id", "42", + "title","Tale of the Brown Fox: was he lazy?", + "body", "No. The quick brown fox was a very brown fox who liked to get into trouble.")); + assertU(adoc("id", "43", + "title","A fable in two acts", + "body", "The brOwn fOx jumped. The lazy dog did not")); + assertU(adoc("id", "44", + "title","Why the LazY dog was lazy", + "body", "News flash: Lazy Dog was not actually lazy, it just seemd so compared to Fox")); + assertU(adoc("id", "45", + "title","Why Are We Lazy?", + "body", "Because we are. 
that's why"));
+    assertU((commit()));
+  }
+
+  @After
+  public void deleteAllDocs() throws Exception {
+    assertU(delQ("*:*"));
+    assertU((commit()));
+  }
+
+  public void testWhiteBoxPhraseParsingLongInput() throws Exception {
+    final SchemaField field = h.getCore().getLatestSchema().getField("multigrams_body");
+    assertNotNull(field);
+    final List<Phrase> phrases = Phrase.extractPhrases
+      (" did a Quick brown FOX perniciously jump over the lAZy dog", field, 3, 7);
+
+    assertEquals(IntStream.rangeClosed((11-7+1), 11).sum(), // 11 words, max query phrase size is 7
+                 phrases.size());
+
+    // spot check a few explicitly chosen phrases of various lengths...
+
+    { // single term, close to edge so not as many super phrases as other terms might have
+      final Phrase lazy = phrases.get(phrases.size() - 1 - 2);
+      final String debug = lazy.toString();
+
+      assertEquals(debug, "lAZy", lazy.getSubSequence());
+      assertEquals(debug, 10, lazy.getPositionStart());
+      assertEquals(debug, 11, lazy.getPositionEnd());
+      assertEquals(debug, 1, lazy.getPositionLength());
+
+      assertEquals(debug, 54, lazy.getOffsetStart());
+      assertEquals(debug, 58, lazy.getOffsetEnd());
+
+      assertEquals(debug, 1, lazy.getIndividualIndexedTerms().size());
+      assertEquals(debug, 1, lazy.getLargestIndexedSubPhrases().size());
+      assertEquals(debug, lazy, lazy.getIndividualIndexedTerms().get(0));
+      assertEquals(debug, lazy, lazy.getLargestIndexedSubPhrases().get(0));
+      assertEquals(debug, 4, lazy.getIndexedSuperPhrases().size()); // (2 each: len=2, len=3)
+    }
+    { // length 2, middle of the pack
+      final Phrase brown_fox = phrases.get((7 * 3) + 1);
+      final String debug = brown_fox.toString();
+
+      assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
+      assertEquals(debug, 4, brown_fox.getPositionStart());
+      assertEquals(debug, 6, brown_fox.getPositionEnd());
+      assertEquals(debug, 2, brown_fox.getPositionLength());
+
+      assertEquals(debug, 17, brown_fox.getOffsetStart());
+      assertEquals(debug, 26, brown_fox.getOffsetEnd());
+
+      assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
+      assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
+      assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
+      assertEquals(debug, 2, brown_fox.getIndexedSuperPhrases().size()); // (2 @ len=3)
+
+    }
+    { // length 3 (which is the max indexed size) @ start of the string
+      final Phrase daq = phrases.get(2);
+      final String debug = daq.toString();
+
+      assertEquals(debug, "did a Quick", daq.getSubSequence());
+      assertEquals(debug, 1, daq.getPositionStart());
+      assertEquals(debug, 4, daq.getPositionEnd());
+      assertEquals(debug, 3, daq.getPositionLength());
+
+      assertEquals(debug, 1, daq.getOffsetStart());
+      assertEquals(debug, 13, daq.getOffsetEnd());
+
+      assertEquals(debug, 3, daq.getIndividualIndexedTerms().size());
+      assertEquals(debug, 1, daq.getLargestIndexedSubPhrases().size());
+      assertEquals(debug, daq, daq.getLargestIndexedSubPhrases().get(0));
+      assertEquals(debug, 0, daq.getIndexedSuperPhrases().size());
+    }
+    { // length 4 phrase (larger than the max indexed size)
+      final Phrase qbfp = phrases.get((7 * 2) + 3);
+      final String debug = qbfp.toString();
+
+      assertEquals(debug, "Quick brown FOX perniciously", qbfp.getSubSequence());
+      assertEquals(debug, 3, qbfp.getPositionStart());
+      assertEquals(debug, 7, qbfp.getPositionEnd());
+      assertEquals(debug, 4, qbfp.getPositionLength());
+
+      assertEquals(debug, 8, qbfp.getOffsetStart());
+      assertEquals(debug, 39, qbfp.getOffsetEnd());
+
+      assertEquals(debug, 4, qbfp.getIndividualIndexedTerms().size());
+      assertEquals(debug, 2, qbfp.getLargestIndexedSubPhrases().size());
+      assertEquals(debug, 0, qbfp.getIndexedSuperPhrases().size());
+    }
+
+    // some blanket assumptions about the results...
+    assertBasicSanityChecks(phrases, 11, 3, 7);
+  }
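+
+  /*
+   * Rough sketch of the candidate count asserted above (assuming an analyzer that does not remove
+   * any tokens): for an input of P positions and a max query phrase size of Q, the extractor
+   * should emit sum over n=1..Q of max(0, P - n + 1) phrases.  With P=11 and Q=7 that is
+   * 5 + 6 + 7 + 8 + 9 + 10 + 11 = 56, i.e. IntStream.rangeClosed((11-7+1), 11).sum().
+   */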
+
+  public void testWhiteBoxPhraseParsingShortInput() throws Exception {
+    // for input this short, either of these fields should be (mostly) equivalent
+    final Map<String,Integer> fields = new TreeMap<>();
+    fields.put("multigrams_body", 7);
+    fields.put("multigrams_body_short", 3);
+    for (Map.Entry<String,Integer> entry : fields.entrySet()) {
+      try {
+        final int maxQ = entry.getValue();
+        final SchemaField field = h.getCore().getLatestSchema().getField(entry.getKey());
+        assertNotNull(field);
+
+        // empty input shouldn't break anything
+        assertEquals(0, Phrase.extractPhrases(random().nextBoolean() ? "" : " ", field, 3, maxQ).size());
+
+        // input shorter than our index/query phrase sizes shouldn't break anything either....
+        final List<Phrase> phrases = Phrase.extractPhrases("brown FOX", field, 3, maxQ);
+
+        assertEquals(3, phrases.size());
+
+        { // length 2
+          final Phrase brown_fox = phrases.get(1);
+          final String debug = brown_fox.toString();
+
+          assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
+          assertEquals(debug, 1, brown_fox.getPositionStart());
+          assertEquals(debug, 3, brown_fox.getPositionEnd());
+          assertEquals(debug, 2, brown_fox.getPositionLength());
+
+          assertEquals(debug, 0, brown_fox.getOffsetStart());
+          assertEquals(debug, 9, brown_fox.getOffsetEnd());
+
+          assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
+          assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
+          assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
+          assertEquals(debug, 0, brown_fox.getIndexedSuperPhrases().size());
+        }
+        { // length 1
+          final Phrase fox = phrases.get(2);
+          final String debug = fox.toString();
+
+          assertEquals(debug, "FOX", fox.getSubSequence());
+          assertEquals(debug, 2, fox.getPositionStart());
+          assertEquals(debug, 3, fox.getPositionEnd());
+          assertEquals(debug, 1, fox.getPositionLength());
+
+          assertEquals(debug, 6, fox.getOffsetStart());
+          assertEquals(debug, 9, fox.getOffsetEnd());
+
+          assertEquals(debug, 1, fox.getIndividualIndexedTerms().size());
+          assertEquals(debug, 1, fox.getLargestIndexedSubPhrases().size());
+          assertEquals(debug, fox, fox.getLargestIndexedSubPhrases().get(0));
+          assertEquals(debug, 1, fox.getIndexedSuperPhrases().size());
+        }
+
+        assertBasicSanityChecks(phrases, 2, 3, maxQ);
+      } catch (AssertionError e) {
+        throw new AssertionError(entry.getKey() + " => " + e.getMessage(), e);
+      }
+    }
+  }
+
+  /**
+   * Asserts some basic rules that should be enforced about all Phrases
+   * & their linkages to other phrases
+   */
+  private void assertBasicSanityChecks(final List<Phrase> phrases,
+                                       final int inputPositionLength,
+                                       final int maxIndexedPositionLength,
+                                       final int maxQueryPositionLength) throws Exception {
+    assert 0 < phrases.size() : "Don't use this method if phrases might be empty";
+
+    assertEmptyStream("no phrase should be longer than "+maxQueryPositionLength+" positions",
+                      phrases.stream().filter(p -> p.getPositionLength() > maxQueryPositionLength));
+
+    assertEmptyStream("no phrase should have a start offset < 0",
+                      phrases.stream().filter(p -> p.getOffsetStart() < 0));
+    assertEmptyStream("no phrase should have a start position < 1",
+                      phrases.stream().filter(p -> p.getPositionStart() < 1));
+
+    assertEmptyStream("If a phrase has a start offset of 0, then it must have position 1",
+                      phrases.stream().filter(p -> (p.getOffsetStart() == 0)
+                                               && (p.getPositionStart() != 1)));
+
+    final Phrase first = phrases.get(0);
+    final Phrase last = phrases.get(phrases.size()-1);
+
+    assertEmptyStream("no phrase should have a start offset < first phrase",
+                      phrases.stream().filter(p -> p.getOffsetStart() < first.getOffsetStart()));
+    assertEmptyStream("no phrase should have an end offset > last phrase",
+                      phrases.stream().filter(p -> last.getOffsetEnd() < p.getOffsetEnd()));
+
+    assertEmptyStream("no phrase should have a start position < first phrase",
+                      phrases.stream().filter(p -> p.getPositionStart() < first.getPositionStart()));
+    assertEmptyStream("no phrase should have an end position > last phrase",
+                      phrases.stream().filter(p -> last.getPositionEnd() < p.getPositionEnd()));
+
+
+    // NOTE: stuff below this point may not be true for all analyzers (ie: stopwords)
+    // but should be valid for the analyzers used in this test...
+    // (if we expand test to cover analyzers w/stopwords, refactor this into a new method)
+
+    for (int n = 1; n <= maxQueryPositionLength; n++) {
+      final int len = n;
+      final int expected = Math.max(0, 1 + inputPositionLength - n);
+      final List<Phrase> sizeN = phrases.stream().filter(p -> p.getPositionLength() == len
+                                                         ).collect(Collectors.toList());
+      assertEquals("Expected # phrases of size " + n + ": " + sizeN, expected, sizeN.size());
+    }
+
+    // check the quantities of sub-terms/phrases...
+    assertEmptyStream("no phrase should have num indexed terms != pos_len",
+                      phrases.stream().filter
+                      (p -> p.getPositionLength() != p.getIndividualIndexedTerms().size()));
+    assertEmptyStream("no phrase should have num sub-phrases != max(1, 1 + pos_len - "+maxIndexedPositionLength+")",
+                      phrases.stream().filter
+                      (p -> (Math.max(1, 1 + p.getPositionLength() - maxIndexedPositionLength)
+                             != p.getLargestIndexedSubPhrases().size())));
+    // NOTE: indexed super phrases can be of various lengths, and differing quantities near
+    // beginning/end of input so don't worry about an exact count, just check their properties (below)
+
+    // check the properties of our sub/super phrases
+    for (Phrase phrase : phrases) {
+      final String debug = phrase.toString();
+
+      assertEmptyStream(debug + " should not have any indexed terms where pos_len != 1",
+                        phrase.getIndividualIndexedTerms().stream().filter
+                        (term -> 1 != term.getPositionLength()));
+
+      assertEmptyStream(debug + " should not have any sub-phrases where pos_len > min(pos_len, "
+                        + maxIndexedPositionLength+")",
+                        phrase.getLargestIndexedSubPhrases().stream().filter
+                        (inner -> (Math.min(phrase.getPositionLength(), maxIndexedPositionLength)
+                                   < inner.getPositionLength())));
+
+      assertEmptyStream(debug + " should not have any super-phrases where super.len <= phrase.len or "
+                        + maxIndexedPositionLength + " < super.len",
+                        phrase.getIndexedSuperPhrases().stream().filter
+                        (outer -> (outer.getPositionLength() <= phrase.getPositionLength() ||
+                                   maxIndexedPositionLength < outer.getPositionLength())));
+    }
+  }
+
+  public void testWhiteboxStats() throws Exception {
+    final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
+    assertNotNull(analysisField);
+    final String input = "BROWN fox lAzY dog xxxyyyzzz";
+
+    // a function we'll re-use on phrases generated from the above input
+    // the multiplier lets us simulate multiple shards returning the same values
+    BiConsumer<Integer,List<Phrase>> assertions = (mult, phrases) -> {
+      final Phrase brown_fox = phrases.get(1);
+      assertEquals("BROWN 
fox", brown_fox.getSubSequence()); + + assertEquals(mult * 1, brown_fox.getTTF("multigrams_title")); + assertEquals(mult * 1, brown_fox.getDocFreq("multigrams_title")); + assertEquals(mult * 1, brown_fox.getConjunctionDocCount("multigrams_title")); + + assertEquals(mult * 3, brown_fox.getTTF("multigrams_body")); + assertEquals(mult * 2, brown_fox.getDocFreq("multigrams_body")); + assertEquals(mult * 2, brown_fox.getConjunctionDocCount("multigrams_body")); + + final Phrase fox_lazy = phrases.get(6); + assertEquals("fox lAzY", fox_lazy.getSubSequence()); + + assertEquals(mult * 0, fox_lazy.getTTF("multigrams_title")); + assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_title")); + assertEquals(mult * 1, fox_lazy.getConjunctionDocCount("multigrams_title")); + + assertEquals(mult * 0, fox_lazy.getTTF("multigrams_body")); + assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_body")); + assertEquals(mult * 2, fox_lazy.getConjunctionDocCount("multigrams_body")); + + final Phrase bfld = phrases.get(3); + assertEquals("BROWN fox lAzY dog", bfld.getSubSequence()); + + expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_title"); }); + expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_title"); }); + assertEquals(mult * 0, bfld.getConjunctionDocCount("multigrams_title")); + + expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_body"); }); + expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_body"); }); + assertEquals(mult * 1, bfld.getConjunctionDocCount("multigrams_body")); + + final Phrase xyz = phrases.get(phrases.size()-1); + + assertEquals("xxxyyyzzz", xyz.getSubSequence()); + assertEquals(mult * 0, xyz.getTTF("multigrams_title")); + assertEquals(mult * 0, xyz.getDocFreq("multigrams_title")); + assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_title")); + + assertEquals(mult * 0, xyz.getTTF("multigrams_body")); + assertEquals(mult * 0, xyz.getDocFreq("multigrams_body")); + assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_body")); + return; + }; + + + final List phrasesLocal = Phrase.extractPhrases(input, analysisField, 3, 7); + + // freshly parsed phrases, w/o any stats populated, all the stats should be 0 + assertions.accept(0, phrasesLocal); + + // If we populate with our index stats, we should get the basic values in our BiConsumer + try (SolrQueryRequest req = req()) { + Phrase.populateStats(phrasesLocal, Arrays.asList("multigrams_body","multigrams_title"), + req.getSearcher()); + } + assertions.accept(1, phrasesLocal); + + // likewise, if we create a new freshly parsed set of phrases, and "merge" in the previous index stats + // (ie: merge results from one shard) we should get the same results + final List phrasesMerged = Phrase.extractPhrases(input, analysisField, 3, 7); + Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal)); + assertions.accept(1, phrasesMerged); + + // if we merge in a second copy of the same results (ie: two identical shards) + // our results should be double what we had before + Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal)); + assertions.accept(2, phrasesMerged); + + } + + public void testWhiteboxScores() throws Exception { + final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body"); + assertNotNull(analysisField); + final Map fieldWeights = new TreeMap<>(); + fieldWeights.put("multigrams_title", 1.0D); + fieldWeights.put("multigrams_body", 0.0D); // NOTE: 0 weighting should only 
affect total score
+
+    final String input = "xxxyyyzzz BROWN fox why are we lAzY";
+    final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+    try (SolrQueryRequest req = req()) {
+      Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+    }
+    Phrase.populateScores(phrases, fieldWeights, 3, 7);
+
+    // do some basic sanity checks of the field & total scores...
+
+    for (Phrase xyz : phrases.subList(0, 7)) {
+      // first 7 all start with xyz which isn't in the index (in either field) so all scores should be -1
+      assertEquals(xyz.toString(), -1.0D, xyz.getTotalScore(), 0.0D);
+      assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_title"), 0.0D);
+      assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_body"), 0.0D);
+    }
+
+    // any individual terms (past xyz) should score 0.0 because they are all actually in the index
+    // (in both fields)
+    for (Phrase term : phrases.subList(7, phrases.size()).stream().filter
+         ((p -> 1 == p.getPositionLength())).collect(Collectors.toList())) {
+
+      assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_title"), 0.0D);
+      assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_body"), 0.0D);
+      assertEquals(term.toString(), 0.0D, term.getTotalScore(), 0.0D);
+    }
+
+    // "brown fox" should score positively in both fields, and overall...
+    final Phrase brown_fox = phrases.get(8);
+    assertEquals("BROWN fox", brown_fox.getSubSequence());
+    assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
+    assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_body"), greaterThan(0.0D) );
+    assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
+
+    // "we lazy" does appear in a title value, but should score poorly given how often the terms
+    // are used in other contexts, and should score -1 against body -- but because of our weights,
+    // that shouldn't bring down the total
+    final Phrase we_lazy = phrases.get(phrases.size()-2);
+    assertEquals("we lAzY", we_lazy.getSubSequence());
+    assertEquals(we_lazy.toString(), -1.0D, we_lazy.getFieldScore("multigrams_body"), 0.0D);
+    assertThat(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), lessThan(0.0D));
+    assertThat(we_lazy.toString(), we_lazy.getTotalScore(), lessThan(0.0D));
+    assertEquals(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), we_lazy.getTotalScore(),
+                 0.0D);
+
+    // "why are we lazy" is longer than the max indexed phrase size & appears verbatim in a title value;
+    // it should score -1 against body -- but because of our weights, that shouldn't bring down the total
+    final Phrase wawl = phrases.get(phrases.size()-7);
+    assertEquals("why are we lAzY", wawl.getSubSequence());
+    assertEquals(wawl.toString(), -1.0D, wawl.getFieldScore("multigrams_body"), 0.0D);
+    assertThat(wawl.toString(), wawl.getFieldScore("multigrams_title"), greaterThan(0.0D));
+    assertThat(wawl.toString(), wawl.getTotalScore(), greaterThan(0.0D));
+    assertEquals(wawl.toString(), wawl.getFieldScore("multigrams_title"), wawl.getTotalScore(),
+                 0.0D);
+
+    // "brown fox why are we" is longer than the max indexed phrase, and none of its
+    // (longest) sub-phrases exist in either field -- so all of its scores should be -1
+    final Phrase bfwaw = phrases.get(11);
+    assertEquals("BROWN fox why are we", bfwaw.getSubSequence());
+    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_title"), 0.0D);
+    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_body"), 0.0D);
+    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getTotalScore(), 0.0D);
+
+  }
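+
+  /*
+   * A loose summary of the scoring conventions exercised above (not a formal spec of the component):
+   *  - a field score of -1 means the phrase (or its largest indexed sub-phrases) wasn't found in
+   *    that field's shingles
+   *  - individual terms that do exist in the index score 0.0
+   *  - phrases whose shingles look comparatively "interesting" in a field score above 0.0
+   *  - a field weighted 0.0 still gets a field score, but doesn't contribute to the total score
+   */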
+
+  public void testWhiteboxScoresStopwords() throws Exception {
+    final String input = "why the lazy dog brown fox";
+    final Map<String,Double> fieldWeights = new TreeMap<>();
+    fieldWeights.put("multigrams_title", 1.0D);
+    fieldWeights.put("multigrams_title_stop", 1.0D);
+
+    { // If our analysisField uses all terms,
+      // but we also generate scores from a field that filters stopwords...
+      final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title");
+      assertNotNull(analysisField);
+
+      final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+      try (SolrQueryRequest req = req()) {
+        Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+      }
+      Phrase.populateScores(phrases, fieldWeights, 3, 7);
+
+      // phrases that span the stop word should have valid scores from the field that doesn't care
+      // about stop words, but the stopword field should reject them
+      final Phrase why_the_lazy = phrases.get(2);
+      assertEquals("why the lazy", why_the_lazy.getSubSequence());
+      assertThat(why_the_lazy.toString(), why_the_lazy.getFieldScore("multigrams_title"), greaterThan(0.0D) );
+      assertEquals(why_the_lazy.toString(), -1.0D, why_the_lazy.getFieldScore("multigrams_title_stop"), 0.0D);
+
+      final Phrase the_lazy_dog = phrases.get(8);
+      assertEquals("the lazy dog", the_lazy_dog.getSubSequence());
+      assertThat(the_lazy_dog.toString(), the_lazy_dog.getFieldScore("multigrams_title"), greaterThan(0.0D) );
+      assertEquals(the_lazy_dog.toString(), -1.0D, the_lazy_dog.getFieldScore("multigrams_title_stop"), 0.0D);
+
+      // sanity check that good scores are still possible with stopwords
+      // "brown fox" should score positively in both fields, and overall...
+      final Phrase brown_fox = phrases.get(phrases.size()-2);
+      assertEquals("brown fox", brown_fox.getSubSequence());
+      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
+      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D) );
+      assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
+    }
+
+    { // now flip things: our analysisField filters stopwords,
+      // but we also generate scores from a field that doesn't know about them...
+      //
+      // (NOTE: the parser will still generate _some_ candidate phrases spanning the stop word position,
+      // but not ones that start with the stopword)
+      final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title_stop");
+      assertNotNull(analysisField);
+
+      final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+      try (SolrQueryRequest req = req()) {
+        Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+      }
+      Phrase.populateScores(phrases, fieldWeights, 3, 7);
+      assertTrue(phrases.toString(), 0 < phrases.size());
+
+      for (Phrase p : phrases) {
+        if (p.getPositionStart() <= 2 && 2 < p.getPositionEnd()) {
+          // candidate phrases spanning the (removed) stop word position shouldn't get a valid
+          // score from either field
+          assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title"), 0.0D);
+          assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title_stop"), 0.0D);
+        }
+      }
+
+      // sanity check that good scores are still possible with stopwords
+      // "brown fox" should score positively in both fields, and overall...
+ final Phrase brown_fox = phrases.get(phrases.size()-2); + assertEquals("brown fox", brown_fox.getSubSequence()); + assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D)); + assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D) ); + assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D)); + } + + } + + public void testExpectedUserErrors() throws Exception { + assertQEx("empty field list should error", + "must specify a (weighted) list of fields", + req("q","foo", "phrases","true", + "phrases.fields", " "), + ErrorCode.BAD_REQUEST); + + assertQEx("bogus field name should error", + "does not exist", + req("q","foo", "phrases","true", + "phrases.fields", "bogus1 bogus2"), + ErrorCode.BAD_REQUEST); + + assertQEx("lack of shingles should cause error", + "Unable to determine max position length", + req("q","foo", "phrases","true", + "phrases.fields", "title"), + ErrorCode.BAD_REQUEST); + + assertQEx("analyzer missmatch should cause error", + "must have the same fieldType", + req("q","foo", "phrases","true", + "phrases.fields", "multigrams_title multigrams_title_short"), + ErrorCode.BAD_REQUEST); + + assertQEx("analysis field must exist", + "does not exist", + req("q","foo", "phrases","true", + "phrases.analysis.field", "bogus", + "phrases.fields", "multigrams_title multigrams_title_short"), + ErrorCode.BAD_REQUEST); + + assertQEx("no query param should error", + "requires a query string", + req("qt", "/phrases", + "phrases.fields", "multigrams_title"), + ErrorCode.BAD_REQUEST); + } + + public void testMaxShingleSizeHelper() throws Exception { + IndexSchema schema = h.getCore().getLatestSchema(); + + assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize + (schema.getFieldTypeByName("multigrams_3_7").getIndexAnalyzer())); + assertEquals(7, PhrasesIdentificationComponent.getMaxShingleSize + (schema.getFieldTypeByName("multigrams_3_7").getQueryAnalyzer())); + + assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize + (schema.getFieldTypeByName("multigrams_3").getIndexAnalyzer())); + assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize + (schema.getFieldTypeByName("multigrams_3").getQueryAnalyzer())); + + assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize + (schema.getFieldTypeByName("text").getIndexAnalyzer())); + assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize + (schema.getFieldTypeByName("text").getQueryAnalyzer())); + + } + + public void testSimplePhraseRequest() throws Exception { + final String input = " did a Quick brown FOX perniciously jump over the lazy dog"; + final String expected = " did a Quick {brown FOX} perniciously jump over {the lazy dog}"; + + // should get same behavior regardless of wether we use "q" or "phrases.q" + for (String p : Arrays.asList("q", "phrases.q")) { + // basic request... + assertQ(req("qt", HANDLER, p, input) + // expect no search results... + , "count(//result)=0" + + // just phrase info... 
+ , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']" + , "//lst[@name='phrases']/str[@name='summary'][.='"+expected+"']" + , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 2" + // + , "//lst[@name='phrases']/arr[@name='details']/lst[1]/str[@name='text'][.='the lazy dog']" + , "//lst[@name='phrases']/arr[@name='details']/lst[1]/int[@name='offset_start'][.='50']" + , "//lst[@name='phrases']/arr[@name='details']/lst[1]/int[@name='offset_end'][.='62']" + , "//lst[@name='phrases']/arr[@name='details']/lst[1]/double[@name='score'][number(.) > 0]" + // + , "//lst[@name='phrases']/arr[@name='details']/lst[2]/str[@name='text'][.='brown FOX']" + , "//lst[@name='phrases']/arr[@name='details']/lst[2]/int[@name='offset_start'][.='17']" + , "//lst[@name='phrases']/arr[@name='details']/lst[2]/int[@name='offset_end'][.='26']" + , "//lst[@name='phrases']/arr[@name='details']/lst[2]/double[@name='score'][number(.) > 0]" + ); + + // empty input, empty phrases (and no error)... + assertQ(req("qt", HANDLER, p, "") + // expect no search results... + , "count(//result)=0" + // just empty phrase info for our empty input... + , "//lst[@name='phrases']/str[@name='input'][.='']" + , "//lst[@name='phrases']/str[@name='summary'][.='']" + , "count(//lst[@name='phrases']/arr[@name='details']) = 1" + , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 0" + ); + } + } + + public void testSimpleSearchRequests() throws Exception { + final String input = "\"brown fox\""; + + assertQ(req("q", input) + // basic search should have worked... + , "//result[@numFound='2']" + , "//result/doc/str[@name='id'][.='42']" + , "//result/doc/str[@name='id'][.='43']" + // and phrases should not be returned since they weren't requested... + , "0=count(//lst[@name='phrases'])" + ); + + assertQ(req("phrases", "false", "q", input) + // basic search should have worked... + , "//result[@numFound='2']" + , "//result/doc/str[@name='id'][.='42']" + , "//result/doc/str[@name='id'][.='43']" + // and phrases should not be returned since they were explicitly disabled... + , "0=count(//lst[@name='phrases'])" + ); + + // with input this short, all of these permutations of requests should produce the same output... + for (SolrQueryRequest req : Arrays.asList + ( // simple, using 3/7 defaults + req("phrases","true", "q", input), + + // simple, using just the 3/3 'short' fields + req("phrases","true", "q", input, + "phrases.fields", "multigrams_body_short multigrams_title_short^2"), + + // diff analysers, but explicit override using 3/3 "short" field... + req("phrases","true", "q", input, + "phrases.fields", "multigrams_body multigrams_title_short^2", + "phrases.analysis.field", "multigrams_title_short"))) { + assertQ(req + // basic search should have worked... + , "//result[@numFound='2']" + , "//result/doc/str[@name='id'][.='42']" + , "//result/doc/str[@name='id'][.='43']" + + // and we should have gotten phrase info... + , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']" + , "//lst[@name='phrases']/str[@name='summary'][.='\"{brown fox}\"']" + , "count(//lst[@name='phrases']/arr[@name='details']/lst)=1" + , "//lst[@name='phrases']/arr[@name='details']/lst/str[@name='text'][.='brown fox']" + , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_start'][.='1']" + , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_end'][.='10']" + , "//lst[@name='phrases']/arr[@name='details']/lst/double[@name='score'][number(.) 
> 0]" + ); + } + + // override the query string to get different phrases + assertQ(req("phrases","true", "q", "*:*", "phrases.q", input) + // basic search should have found all docs... + , "//result[@numFound='4']" + // and we should have gotten phrase info for our alternative q string... + , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']" + , "//lst[@name='phrases']/str[@name='summary'][.='\"{brown fox}\"']" + , "count(//lst[@name='phrases']/arr[@name='details']/lst)=1" + , "//lst[@name='phrases']/arr[@name='details']/lst/str[@name='text'][.='brown fox']" + , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_start'][.='1']" + , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_end'][.='10']" + , "//lst[@name='phrases']/arr[@name='details']/lst/double[@name='score'][number(.) > 0]" + ); + + // empty input, empty phrases (but no error) + assertQ(req("phrases","true", "q", "*:*", "phrases.q", "") + // basic search should have found all docs... + , "//result[@numFound='4']" + // and we should have gotten (empty) phrase info for our alternative q string... + , "//lst[@name='phrases']/str[@name='input'][.='']" + , "//lst[@name='phrases']/str[@name='summary'][.='']" + , "count(//lst[@name='phrases']/arr[@name='details']) = 1" + , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 0" + ); + } + + public void testGreyboxShardSearchRequests() throws Exception { + final String input = "quick brown fox ran"; + + final String phrase_xpath = "//lst[@name='phrases']"; + final String all_phrase_xpath = phrase_xpath + "/arr[@name='_all']"; + + // phrases requested, and correct request stage / shard purpose ... + assertQ(req("q", input, + "phrases","true", + ShardParams.IS_SHARD, "true", + ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE) + + // this shard request should have caused stats to be returned about all phrases... + , "10=count("+ all_phrase_xpath +"/lst)" + // "quick" ... + , all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_body'][.='1']" + , all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_title'][.='0']" + // ... + // "brown fox" + , all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_body'][.='3']" + , all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_title'][.='1']" + , all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_body'][.='2']" + , all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_title'][.='1']" + , all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_body'][.='2']" + , all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_title'][.='1']" + + // but no computed "scores"... + , "0=count("+phrase_xpath+"//*[@name='score'])" + ); + + // phrases requested, but incorrect request stage / shard purpose ... + assertQ(req("q", input, + "phrases","true", + ShardParams.IS_SHARD, "true", + ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS) + , "0=count("+ phrase_xpath +"/lst)"); + + // phrases disabled, regardless of request stage / shard purpose ... 
+    assertTrue("sanity check failed, stage was modified in code w/o updating test",
+               PhrasesIdentificationComponent.SHARD_PURPOSE != ShardRequest.PURPOSE_GET_FIELDS);
+    assertQ(req("q", input,
+                "phrases","false",
+                ShardParams.IS_SHARD, "true",
+                ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
+            , "0=count("+ phrase_xpath +"/lst)");
+    assertQ(req("q", input,
+                "phrases","false",
+                ShardParams.IS_SHARD, "true",
+                ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)
+            , "0=count("+ phrase_xpath +"/lst)");
+  }
+
+
+
+  // ////////////////////////////////////////////////////////////////
+
+
+
+
+  /**
+   * Trivial helper method that collects & compares to an empty List so
+   * the assertion shows the unexpected stream elements
+   */
+  public void assertEmptyStream(final String msg, final Stream<? extends Object> stream) {
+    assertEquals(msg,
+                 Collections.emptyList(),
+                 stream.collect(Collectors.toList()));
+  }
+
+  /** helper; future versions of junit/hamcrest seem to have something similar */
+  public static Matcher<Double> lessThan(double expected) {
+    return new BaseMatcher<Double>() {
+      @Override public boolean matches(Object actual) {
+        return ((Double)actual).compareTo(expected) < 0;
+      }
+      @Override public void describeTo(Description d) {
+        d.appendText("should be less than " + expected);
+      }
+    };
+  }
+  /** helper; future versions of junit/hamcrest seem to have something similar */
+  public static Matcher<Double> greaterThan(double expected) {
+    return new BaseMatcher<Double>() {
+      @Override public boolean matches(Object actual) {
+        return 0 < ((Double)actual).compareTo(expected);
+      }
+      @Override public void describeTo(Description d) {
+        d.appendText("should be greater than " + expected);
+      }
+    };
+  }
+}