diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 19db81ef9d7..3d947c7de0a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -1,4 +1,4 @@
- Apache Solr Release Notes
+ Apache Solr Release Notes
Introduction
------------
@@ -208,6 +208,9 @@ New Features
doc transformers if present. In 7.5 a missing 'fl' defaults to the current behavior of all fields, but in 8.0
defaults to the top/request "fl". (Moshe Bla, David Smiley)
+* SOLR-9418: Added a new (experimental) PhrasesIdentificationComponent for identifying potential phrases
+ in query input based on overlapping shingles in the index. (Akash Mehta, Trey Grainger, hossman)
+
Bug Fixes
----------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java b/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java
new file mode 100644
index 00000000000..bac5a4c089a
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java
@@ -0,0 +1,1129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.component;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.LongSummaryStatistics;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
+
+import org.apache.solr.analysis.TokenizerChain;
+import org.apache.solr.client.solrj.SolrResponse;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.ShardParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SimpleOrderedMap;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.util.SolrPluginUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * A component that can be used in isolation, or in conjunction with {@link QueryComponent} to identify
+ * & score "phrases" found in the input string, based on shingles in indexed fields.
+ *
+ *
+ * The most common way to use this component is in conjunction with fields that use
+ * {@link ShingleFilterFactory} on both the index and query analyzers.
+ * An example field type configuration would be something like this...
+ *
+ *
+ * <fieldType name="phrases" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer type="index">
+ * <tokenizer class="solr.StandardTokenizerFactory"/>
+ * <filter class="solr.LowerCaseFilterFactory"/>
+ * <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/>
+ * </analyzer>
+ * <analyzer type="query">
+ * <tokenizer class="solr.StandardTokenizerFactory"/>
+ * <filter class="solr.LowerCaseFilterFactory"/>
+ * <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/>
+ * </analyzer>
+ * </fieldType>
+ *
+ *
+ * ...where the query analyzer's maxShingleSize="7" determines the maximum
+ * possible phrase length that can be heuristically deduced, and the index analyzer's
+ * maxShingleSize="3" determines the accuracy of the phrases identified. The larger the
+ * indexed maxShingleSize, the higher the accuracy. Both analyzers must include
+ * minShingleSize="2" outputUnigrams="true".
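+ * For example, with the configuration above the component can heuristically identify candidate
+ * phrases of up to 7 positions in the query input, while scoring them against statistics from
+ * indexed shingles of at most 3 positions.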
+ *
+ *
+ * With a field type like this, one or more fields can be specified (with weights) via a
+ * phrases.fields param to request that this component identify possible phrases in the
+ * input q param, or in an alternative phrases.q override param. The identified
+ * phrases will include their scores relative to each field specified, as well as an overall
+ * weighted score based on the field weights provided by the client. Higher score values indicate
+ * a greater confidence in the Phrase.
+ *
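+ * A request using this component might look like the following (an illustrative sketch only;
+ * the example field names come from this component's test configs and are not required):
+ *
+ * q=brown fox&phrases=true&phrases.fields=multigrams_body multigrams_title^2
+ *
+ * The response will contain a "phrases" section holding the original "input", a "summary" of the
+ * input decorated to show where the identified phrases begin and end, and a list of "details" for
+ * each identified phrase (its text, offsets, overall score, and per-field scores).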
+ *
+ *
+ * NOTE: In a distributed request, this component uses a single phase (piggy backing on the
+ * {@link ShardRequest#PURPOSE_GET_TOP_IDS} generated by {@link QueryComponent} if it is in use) to
+ * collect all field & shingle stats. No "refinement" requests are used.
+ *
+ *
+ * @lucene.experimental
+ */
+public class PhrasesIdentificationComponent extends SearchComponent {
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ /** The only shard purpose that will cause this component to do work & return data during shard req */
+ public static final int SHARD_PURPOSE = ShardRequest.PURPOSE_GET_TOP_IDS;
+
+ /** Name, also used as a request param to identify whether the user query concerns this component */
+ public static final String COMPONENT_NAME = "phrases";
+
+ // TODO: ideally these should live in a common.params class?
+ public static final String PHRASE_INPUT = "phrases.q";
+ public static final String PHRASE_FIELDS = "phrases.fields";
+ public static final String PHRASE_ANALYSIS_FIELD = "phrases.analysis.field";
+ public static final String PHRASE_SUMMARY_PRE = "phrases.pre";
+ public static final String PHRASE_SUMMARY_POST = "phrases.post";
+ public static final String PHRASE_INDEX_MAXLEN = "phrases.maxlength.index";
+ public static final String PHRASE_QUERY_MAXLEN = "phrases.maxlength.query";
+
+ @Override
+ public void prepare(ResponseBuilder rb) throws IOException {
+ final SolrParams params = rb.req.getParams();
+ if (!params.getBool(COMPONENT_NAME, false)) {
+ return;
+ }
+ if (params.getBool(ShardParams.IS_SHARD, false)) {
+ // only one stage/purpose where we should do any work on a shard
+ if (0 == (SHARD_PURPOSE & params.getInt(ShardParams.SHARDS_PURPOSE, 0))) {
+ return;
+ }
+ }
+
+ // if we're still here, then we should parse & validate our input,
+ // putting it in the request context so our process method knows it should do work
+ rb.req.getContext().put(this.getClass(), PhrasesContextData.parseAndValidateRequest(rb.req));
+ }
+
+ @Override
+ public int distributedProcess(ResponseBuilder rb) {
+ final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
+ if (null == contextData) {
+ // if prepare didn't give us anything to work with, then we should do nothing
+ return ResponseBuilder.STAGE_DONE;
+ }
+
+ if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
+ return ResponseBuilder.STAGE_EXECUTE_QUERY;
+
+ } else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
+ // if we're being used in conjunction with QueryComponent, it should have already created
+ // (in this stage) the only ShardRequest we need...
+ for (ShardRequest sreq : rb.outgoing) {
+ if (0 != (SHARD_PURPOSE & sreq.purpose) ) {
+ return ResponseBuilder.STAGE_GET_FIELDS;
+ }
+ }
+ // ...if we can't find it, then evidently we're being used in isolation,
+ // and we need to create our own ShardRequest...
+ ShardRequest sreq = new ShardRequest();
+ sreq.purpose = SHARD_PURPOSE;
+ sreq.params = new ModifiableSolrParams(rb.req.getParams());
+ sreq.params.remove(ShardParams.SHARDS);
+ rb.addRequest(this, sreq);
+ return ResponseBuilder.STAGE_GET_FIELDS;
+
+ } else if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
+ // NOTE: we don't do any actual work in this stage, but we need to ensure that even if
+ // we are being used in isolation w/o QueryComponent that SearchHandler "tracks" a STAGE_GET_FIELDS
+ // so that finishStage(STAGE_GET_FIELDS) is called on us and we can add our merged results
+ // (w/o needing extra code paths for merging phrase results when QueryComponent is/is not used)
+ return ResponseBuilder.STAGE_DONE;
+ }
+
+ return ResponseBuilder.STAGE_DONE;
+ }
+
+ @Override
+ public void finishStage(ResponseBuilder rb) {
+ // NOTE: we don't do this after STAGE_EXECUTE_QUERY because if we're also being used with
+ // QueryComponent, we don't want to add our results to the response until *after*
+ // QueryComponent adds the main DocList
+
+ final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
+ if (null == contextData || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) {
+ // if prepare didn't give us anything to work with, or this isn't our stage, then do nothing
+ return;
+ }
+
+ // sanity check: the shard requests we use/piggy-back on should only happen once per shard,
+ // but let's future proof ourselves against the possibility that some shards might get/respond
+ // to the same request "purpose" multiple times...
+ final BitSet shardsHandled = new BitSet(rb.shards.length);
+
+ // Collect Shard responses
+ for (ShardRequest sreq : rb.finished) {
+ if (0 != (sreq.purpose & SHARD_PURPOSE)) {
+ for (ShardResponse shardRsp : sreq.responses) {
+ final int shardNum = rb.getShardNum(shardRsp.getShard());
+ if (! shardsHandled.get(shardNum)) {
+ shardsHandled.set(shardNum);
+ // shards.tolerant=true can cause nulls on exceptions/errors
+ // if we don't get phrases/stats from a shard, just ignore that shard
+ final SolrResponse rsp = shardRsp.getSolrResponse();
+ if (null == rsp) continue;
+ final NamedList top = rsp.getResponse();
+ if (null == top) continue;
+ final NamedList phrasesWrapper = (NamedList) top.get("phrases");
+ if (null == phrasesWrapper) continue;
+ final List<NamedList<Object>> shardPhrases = (List<NamedList<Object>>) phrasesWrapper.get("_all");
+ if (null == shardPhrases) continue;
+
+ Phrase.populateStats(contextData.allPhrases, shardPhrases);
+ }
+ }
+ }
+ }
+ scoreAndAddResultsToResponse(rb, contextData);
+ }
+
+
+ @Override
+ public void process(ResponseBuilder rb) throws IOException {
+ final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
+ if (null == contextData) {
+ // if prepare didn't give us anything to work with, then we should do nothing
+ return;
+ }
+
+ // regardless of single node / shard, we need local stats...
+ Phrase.populateStats(contextData.allPhrases, contextData.fieldWeights.keySet(), rb.req.getSearcher());
+
+ if ( rb.req.getParams().getBool(ShardParams.IS_SHARD, false) ) {
+ // shard request, return stats for all phrases (in original order)
+ SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
+ output.add("_all", Phrase.formatShardResponse(contextData.allPhrases));
+ // TODO: might want to add numDocs() & getSumTotalTermFreq(f)/getDocCount(f) stats from each field...
+ // so that we can sum/merge them for use in scoring?
+ rb.rsp.add("phrases", output);
+ } else {
+ // full single node request...
+ scoreAndAddResultsToResponse(rb, contextData);
+ }
+ }
+
+ /**
+ * Helper method (suitable for both single node & distributed coordinator node) to
+ * score, sort, and format the end user response once all phrases have been populated with stats.
+ */
+ private void scoreAndAddResultsToResponse(final ResponseBuilder rb, final PhrasesContextData contextData) {
+ assert null != contextData : "Should not be called if no phrase data to use";
+ if (null == contextData) {
+ // if prepare didn't give us anything to work with, then we should do nothing
+ return;
+ }
+
+ SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
+ rb.rsp.add("phrases", output);
+ output.add("input", contextData.rawInput);
+
+ if (0 == contextData.allPhrases.size()) {
+ // w/o any phrases, the summary is just the input again...
+ output.add("summary", contextData.rawInput);
+ output.add("details", Collections.emptyList());
+ return;
+ }
+
+ Phrase.populateScores(contextData);
+ final int maxPosition = contextData.allPhrases.get(contextData.allPhrases.size()-1).getPositionEnd();
+
+ final List<Phrase> validScoringPhrasesSorted = contextData.allPhrases.stream()
+ // TODO: ideally this cut off of "0.0" should be a request option...
+ // so users can tune how aggressive/conservative they want to be in finding phrases
+ // but for that to be useful, we need:
+ // - more hard & fast documentation about the "range" of scores that may be returned
+ // - "useful" scores for single words
+ .filter(p -> 0.0D < p.getTotalScore())
+ .sorted(Comparator.comparing((p -> p.getTotalScore()), Collections.reverseOrder()))
+ .collect(Collectors.toList());
+
+ // we want to return only high scoring phrases that don't overlap w/higher scoring phrase
+ final BitSet positionsCovered = new BitSet(maxPosition+1);
+ final List<Phrase> results = new ArrayList<>(maxPosition);
+ for (Phrase phrase : validScoringPhrasesSorted) {
+ final BitSet phrasePositions = phrase.getPositionsBitSet();
+
+ if (! phrasePositions.intersects(positionsCovered)) {
+ // we can use this phrase, record it...
+ positionsCovered.or(phrasePositions);
+ results.add(phrase);
+ } // else: overlaps higher scoring position(s), skip this phrase
+
+ if (positionsCovered.cardinality() == maxPosition+1) {
+ // all positions are covered, so we can bail out and skip the rest
+ break;
+ }
+ }
+
+ // a "quick summary" of the suggested parsing
+ output.add("summary", contextData.summarize(results));
+ // useful user level info on every (high scoring) phrase found (in current, descending score, order)
+ output.add("details", results.stream()
+ .map(p -> p.getDetails()).collect(Collectors.toList()));
+ }
+
+ @Override
+ public String getDescription() {
+ return "Phrases Identification Component";
+ }
+
+ /**
+ * Simple container for all request options and data this component needs to store in the Request Context
+ * @lucene.internal
+ */
+ public static final class PhrasesContextData {
+
+ public final String rawInput;
+ public final int maxIndexedPositionLength;
+ public final int maxQueryPositionLength;
+ public final Map<String,Double> fieldWeights;
+ public final SchemaField analysisField;
+ public final List<Phrase> allPhrases;
+ public final String summaryPre;
+ public final String summaryPost;
+
+ // TODO: add an option to bias field weights based on sumTTF of the fields
+ // (easy enough to "sum the sums" across multiple shards before scoring)
+
+ /**
+ * Parses the params included in this request, throwing appropriate user level
+ * Exceptions for invalid input, and returning a PhrasesContextData
+ * suitable for use in this request.
+ */
+ public static PhrasesContextData parseAndValidateRequest(final SolrQueryRequest req) throws SolrException {
+ return new PhrasesContextData(req);
+ }
+ private PhrasesContextData(final SolrQueryRequest req) throws SolrException {
+ final SolrParams params = req.getParams();
+
+ this.rawInput = params.get(PHRASE_INPUT, params.get(CommonParams.Q));
+ if (null == this.rawInput) {
+ throw new SolrException(ErrorCode.BAD_REQUEST, "phrase identification requires a query string or "
+ + PHRASE_INPUT + " param override");
+ }
+
+ { // field weights & analysis field...
+
+ SchemaField tmpAnalysisField = null;
+ Map<String,Double> tmpWeights = new TreeMap<>();
+
+ final String analysisFieldName = params.get(PHRASE_ANALYSIS_FIELD);
+ if (null != analysisFieldName) {
+ tmpAnalysisField = req.getSchema().getFieldOrNull(analysisFieldName);
+ if (null == tmpAnalysisField) {
+ throw new SolrException(ErrorCode.BAD_REQUEST,
+ PHRASE_ANALYSIS_FIELD + " param specifies a field name that does not exist: " +
+ analysisFieldName);
+ }
+ }
+
+ final Map<String,Float> rawFields = SolrPluginUtils.parseFieldBoosts(params.getParams(PHRASE_FIELDS));
+ if (rawFields.isEmpty()) {
+ throw new SolrException(ErrorCode.BAD_REQUEST,
+ PHRASE_FIELDS + " param must specify a (weighted) list of fields " +
+ "to evaluate for phrase identification");
+ }
+
+ for (Map.Entry<String,Float> entry : rawFields.entrySet()) {
+ final SchemaField field = req.getSchema().getFieldOrNull(entry.getKey());
+ if (null == field) {
+ throw new SolrException(ErrorCode.BAD_REQUEST,
+ PHRASE_FIELDS + " param contains a field name that does not exist: " +
+ entry.getKey());
+ }
+ if (null == tmpAnalysisField) {
+ tmpAnalysisField = field;
+ }
+ if ( null == analysisFieldName ) {
+ if (! field.getType().equals(tmpAnalysisField.getType())) {
+ throw new SolrException
+ (ErrorCode.BAD_REQUEST,
+ "All fields specified in " + PHRASE_FIELDS + " must have the same fieldType, " +
+ "or the advanced " + PHRASE_ANALYSIS_FIELD + " option must specify an override");
+ }
+ }
+ // if a weight isn't specified, assume "1.0"
+ final double weight = null == entry.getValue() ? 1.0D : entry.getValue();
+ if (weight < 0) {
+ throw new SolrException(ErrorCode.BAD_REQUEST,
+ PHRASE_FIELDS + " param must use non-negative weight value for field " + field.getName());
+ }
+ tmpWeights.put(entry.getKey(), weight);
+ }
+ assert null != tmpAnalysisField;
+
+ this.analysisField = tmpAnalysisField;
+ this.fieldWeights = Collections.unmodifiableMap(tmpWeights);
+ }
+
+ { // index/query max phrase sizes...
+ final FieldType ft = analysisField.getType();
+ this.maxIndexedPositionLength = req.getParams().getInt(PHRASE_INDEX_MAXLEN,
+ getMaxShingleSize(ft.getIndexAnalyzer()));
+ if (this.maxIndexedPositionLength < 0) {
+ throw new SolrException(ErrorCode.BAD_REQUEST,
+ "Unable to determine max position length of indexed phrases using " +
+ "index analyzer for analysis field: " + analysisField.getName() +
+ " and no override detected using param: " + PHRASE_INDEX_MAXLEN);
+ }
+ this.maxQueryPositionLength = req.getParams().getInt(PHRASE_QUERY_MAXLEN,
+ getMaxShingleSize(ft.getQueryAnalyzer()));
+ if (this.maxQueryPositionLength < 0) {
+ throw new SolrException(ErrorCode.BAD_REQUEST,
+ "Unable to determine max position length of query phrases using " +
+ "query analyzer for analysis field: " + analysisField.getName() +
+ " and no override detected using param: " + PHRASE_QUERY_MAXLEN);
+ }
+ if (this.maxQueryPositionLength < this.maxIndexedPositionLength) {
+ throw new SolrException
+ (ErrorCode.BAD_REQUEST,
+ "Effective value of " + PHRASE_INDEX_MAXLEN + " (either from index analyzer shingle factory, " +
+ " or expert param override) must be less then or equal to the effective value of " +
+ PHRASE_QUERY_MAXLEN + " (either from query analyzer shingle factory, or expert param override)");
+ }
+ }
+
+ this.summaryPre = params.get(PHRASE_SUMMARY_PRE, "{");
+ this.summaryPost = params.get(PHRASE_SUMMARY_POST, "}");
+
+ this.allPhrases = Phrase.extractPhrases(this.rawInput, this.analysisField,
+ this.maxIndexedPositionLength,
+ this.maxQueryPositionLength);
+
+ }
+
+ /**
+ * Given a list of phrases to be returned to the user, summarizes those phrases by decorating the
+ * original input string to indicate where the identified phrases exist, using {@link #summaryPre}
+ * and {@link #summaryPost}
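+ *
+ * For example, with the default summaryPre of "{" and summaryPost of "}", an input of
+ * "quick brown fox" in which "brown fox" was the only identified phrase would be summarized
+ * as "quick {brown fox}".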
+ *
+ * @param results a list of (non overlapping) Phrases that have been identified, sorted from highest scoring to lowest
+ * @return the original user input, decorated to indicate the identified phrases
+ */
+ public String summarize(final List<Phrase> results) {
+ final StringBuffer out = new StringBuffer(rawInput);
+
+ // sort by *reverse* position so we can go back to front
+ final List<Phrase> reversed = results.stream()
+ .sorted(Comparator.comparing((p -> p.getPositionStart()), Collections.reverseOrder()))
+ .collect(Collectors.toList());
+
+ for (Phrase p : reversed) {
+ out.insert(p.getOffsetEnd(), summaryPost);
+ out.insert(p.getOffsetStart(), summaryPre);
+ }
+ return out.toString();
+ }
+ }
+
+
+ /**
+ * Model the data known about a single (candidate) Phrase -- which may or may not be indexed
+ * @lucene.internal
+ */
+ public static final class Phrase {
+
+ /**
+ * Factory method for constructing a list of Phrases given the specified input and using the analyzer
+ * for the specified field. The maxIndexedPositionLength and maxQueryPositionLength
+ * provided *must* match the effective values used by the respective analyzers.
+ */
+ public static List<Phrase> extractPhrases(final String input, final SchemaField analysisField,
+ final int maxIndexedPositionLength,
+ final int maxQueryPositionLength) {
+
+ // TODO: rather than requiring the query analyzer to produce the Phrases for us (assuming Shingles)
+ // we could potentially just require that it produces unigrams compatible with the unigrams in the
+ // indexed fields, and then build our own Phrases at query time -- making the maxQueryPositionLength
+ // a 100% run time configuration option.
+ // But that could be tricky given an arbitrary analyzer -- we'd have to pay careful attention
+ // to positions, and we'd have to guess/assume what placeholders/fillers were used in the indexed Phrases
+ // (typically shingles)
+
+ assert maxIndexedPositionLength <= maxQueryPositionLength;
+
+ final CharsRefBuilder buffer = new CharsRefBuilder();
+ final FieldType ft = analysisField.getType();
+ final Analyzer analyzer = ft.getQueryAnalyzer();
+ final List<Phrase> results = new ArrayList<>(42);
+ try (TokenStream tokenStream = analyzer.tokenStream(analysisField.getName(), input)) {
+
+ final OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
+ final PositionIncrementAttribute posIncAttr = tokenStream.addAttribute(PositionIncrementAttribute.class);
+ final PositionLengthAttribute posLenAttr = tokenStream.addAttribute(PositionLengthAttribute.class);
+ final TermToBytesRefAttribute termAttr = tokenStream.addAttribute(TermToBytesRefAttribute.class);
+
+ int position = 0;
+ int lastPosLen = -1;
+
+ tokenStream.reset();
+ while (tokenStream.incrementToken()) {
+ final Phrase phrase = new Phrase();
+
+ final int posInc = posIncAttr.getPositionIncrement();
+ final int posLen = posLenAttr.getPositionLength();
+
+ if (0 == posInc && posLen <= lastPosLen) {
+ // This requirement that analyzers return tokens in ascending order of length
+ // is currently necessary for the "linking" logic below to work;
+ // if people run into real world situations where this is problematic,
+ // we can relax this check if we also make the linking logic more complex
+ // (ie: less optimized)
+ throw new SolrException
+ (ErrorCode.BAD_REQUEST, "Phrase identification currently requires that " +
+ "the analyzer used must produce tokens that overlap in increasing order of length. ");
+ }
+
+ position += posInc;
+ lastPosLen = posLen;
+
+ phrase.position_start = position;
+ phrase.position_end = position + posLen;
+
+ phrase.is_indexed = (posLen <= maxIndexedPositionLength);
+
+ phrase.offset_start = offsetAttr.startOffset();
+ phrase.offset_end = offsetAttr.endOffset();
+
+ // populate the subsequence directly from the raw input using the offsets,
+ // (instead of using the TermToBytesRefAttribute) so we preserve the original
+ // casing, whitespace, etc...
+ phrase.subSequence = input.subSequence(phrase.offset_start, phrase.offset_end);
+
+ if (phrase.is_indexed) {
+ // populate the bytes so we can build term queries
+ phrase.bytes = BytesRef.deepCopyOf(termAttr.getBytesRef());
+ }
+
+ results.add(phrase);
+ }
+ tokenStream.end();
+ } catch (IOException e) {
+ throw new SolrException(ErrorCode.SERVER_ERROR,
+ "Analysis error extracting phrases from: " + input, e);
+ }
+
+ // fill in the relationships of each phrase
+ //
+ // NOTE: this logic currently requires that the phrases are sorted by position ascending
+ // (automatic because of how PositionIncrementAttribute works) then by length ascending
+ // (when positions are tied).
+ // We could de-optimize this code if we find that secondary ordering is too restrictive for
+ // some analyzers
+ //
+ // NOTE: changes to the scoring model may allow us to optimize/prune down the relationships tracked,
+ // ...OR.... may require us to add/track more details about sub/parent phrases
+ //
+ for (int p = 0; p < results.size(); p++) {
+ final Phrase current = results.get(p);
+ if (! current.is_indexed) {
+ // we're not an interesting sub phrase of anything
+ continue;
+ }
+
+ // setup links from the phrase to itself if needed
+ addLinkages(current, current, maxIndexedPositionLength);
+
+ // scan backwards looking for phrases that might include us...
+ BEFORE: for (int i = p-1; 0 <= i; i--) {
+ final Phrase previous = results.get(i);
+ if (previous.position_start < (current.position_end - maxQueryPositionLength)) {
+ // we've scanned so far back nothing else is viable
+ break BEFORE;
+ }
+ // any 'previous' phrases must start where current starts or earlier,
+ // so only need to check the end...
+ if (current.position_end <= previous.position_end) {
+ addLinkages(previous, current, maxIndexedPositionLength);
+ }
+ }
+ // scan forwards looking for phrases that might include us...
+ AFTER: for (int i = p+1; i < results.size(); i++) {
+ final Phrase next = results.get(i);
+ // the only way a phrase that comes after current can include current is
+ // if they have the same start position...
+ if (current.position_start != next.position_start) {
+ // we've scanned so far forward nothing else is viable
+ break AFTER;
+ }
+ // any 'next' phrases must start where current starts, so only need to check the end...
+ if (current.position_end <= next.position_end) {
+ addLinkages(next, current, maxIndexedPositionLength);
+ }
+ }
+ }
+
+ return Collections.unmodifiableList(results);
+ }
+
+ /**
+ * Given two phrases, one of which is a superset of the other, adds the necessary linkages
+ * needed by the scoring model
+ */
+ private static void addLinkages(final Phrase outer, final Phrase inner,
+ final int maxIndexedPositionLength) {
+
+ assert outer.position_start <= inner.position_start;
+ assert inner.position_end <= outer.position_end;
+ assert inner.is_indexed;
+
+ final int inner_len = inner.getPositionLength();
+ if (1 == inner_len) {
+ outer.individualIndexedTerms.add(inner);
+ }
+ if (maxIndexedPositionLength == inner_len
+ || (inner == outer && inner_len < maxIndexedPositionLength)) {
+ outer.largestIndexedSubPhrases.add(inner);
+ }
+ if (outer.is_indexed && inner != outer) {
+ inner.indexedSuperPhrases.add(outer);
+ }
+ }
+
+ /**
+ * Format the phrases suitable for returning in a shard response
+ * @see #populateStats(List,List)
+ */
+ public static List<NamedList<Object>> formatShardResponse(final List<Phrase> phrases) {
+ List<NamedList<Object>> results = new ArrayList<>(phrases.size());
+ for (Phrase p : phrases) {
+ NamedList<Object> data = new SimpleOrderedMap<>();
+ // quick and dirty way to validate that our shards aren't using different analyzers
+ // so the coordinating node can fail fast when merging the results
+ data.add("checksum", p.getChecksum());
+ if (p.is_indexed) {
+ data.add("ttf", new NamedList(p.phrase_ttf));
+ data.add("df", new NamedList(p.phrase_df));
+ }
+ data.add("conj_dc", new NamedList(p.subTerms_conjunctionCounts));
+
+ results.add(data);
+ }
+ return results;
+ }
+
+ /**
+ * Populates the phrases with (merged) stats from a remote shard
+ * @see #formatShardResponse
+ */
+ public static void populateStats(final List<Phrase> phrases, final List<NamedList<Object>> shardData) {
+ final int numPhrases = phrases.size();
+ if (shardData.size() != numPhrases) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ "num phrases in shard data not consistent: " +
+ numPhrases + " vs " + shardData.size());
+ }
+ for (int i = 0; i < phrases.size(); i++) {
+ // rather than being paranoid about the expected structure, we'll just let the low level
+ // code throw an NPE / CCE / AIOOBE / etc. and wrap & rethrow later...
+ try {
+ final Phrase p = phrases.get(i);
+ final NamedList<Object> data = shardData.get(i);
+ // sanity check the correct phrase
+ if (! p.getChecksum().equals(data.get("checksum"))) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ "phrase #" + i + " in shard data had invalid checksum");
+ }
+ if (p.is_indexed) {
+ for (Map.Entry<String,Long> ttf : (NamedList<Long>) data.get("ttf")) {
+ p.phrase_ttf.merge(ttf.getKey(), ttf.getValue(), Long::sum);
+ }
+ for (Map.Entry<String,Long> df : (NamedList<Long>) data.get("df")) {
+ p.phrase_df.merge(df.getKey(), df.getValue(), Long::sum);
+ }
+ }
+ for (Map.Entry<String,Long> conj_dc : (NamedList<Long>) data.get("conj_dc")) {
+ p.subTerms_conjunctionCounts.merge(conj_dc.getKey(), conj_dc.getValue(), Long::sum);
+ }
+ } catch (RuntimeException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ "shard data for phrase#" + i + " not consistent", e);
+ }
+ }
+ }
+
+ /**
+ * Populates the phrases with stats from the local index for the specified fields
+ */
+ public static void populateStats(final List<Phrase> phrases, final Collection<String> fieldNames,
+ final SolrIndexSearcher searcher) throws IOException {
+ final IndexReader reader = searcher.getIndexReader();
+ for (String field : fieldNames) {
+ for (Phrase phrase : phrases) {
+ if (phrase.is_indexed) {
+ // add stats based on this entire phrase as an indexed term
+ final Term t = new Term(field, phrase.bytes);
+ phrase.phrase_ttf.put(field, reader.totalTermFreq(t));
+ phrase.phrase_df.put(field, (long)reader.docFreq(t));
+ }
+
+ // even if our phrase is too long to be indexed whole, add stats based on the
+ // conjunction of all the individual terms in the phrase
+ List<Query> filters = new ArrayList<>(phrase.individualIndexedTerms.size());
+ for (Phrase term : phrase.individualIndexedTerms) {
+ // trust the SolrIndexSearcher to cache & intersect the individual terms so that this
+ // can be efficient regardless of how often terms are re-used multiple times in the input/phrases
+ filters.add(new TermQuery(new Term(field, term.bytes)));
+ }
+ final long count = searcher.getDocSet(filters).size();
+ phrase.subTerms_conjunctionCounts.put(field, count);
+ }
+ }
+ }
+
+ /**
+ * Uses the previously populated stats to populate each Phrase with its scores for the specified fields,
+ * and its overall (weighted) total score. This is not needed on shard requests.
+ *
+ * @see #populateStats
+ * @see #getFieldScore(String)
+ * @see #getTotalScore
+ */
+ public static void populateScores(final PhrasesContextData contextData) {
+ populateScores(contextData.allPhrases, contextData.fieldWeights,
+ contextData.maxIndexedPositionLength,
+ contextData.maxQueryPositionLength);
+ }
+
+ /**
+ * Public for testing purposes
+ * @see #populateScores(PhrasesIdentificationComponent.PhrasesContextData)
+ * @lucene.internal
+ */
+ public static void populateScores(final List<Phrase> phrases, final Map<String,Double> fieldWeights,
+ final int maxIndexedPositionLength,
+ final int maxQueryPositionLength) {
+ final double total_weight = fieldWeights.values().stream().mapToDouble(Double::doubleValue).sum();
+ for (Phrase phrase : phrases) {
+ double phrase_cumulative_score = 0.0D;
+ for (Map.Entry<String,Double> entry : fieldWeights.entrySet()) {
+ final String field = entry.getKey();
+ final double weight = entry.getValue();
+ double field_score = computeFieldScore(phrase, field,
+ maxIndexedPositionLength, maxQueryPositionLength);
+ phrase.fieldScores.put(field,field_score);
+ phrase_cumulative_score += (field_score * weight);
+ }
+ phrase.total_score = (total_weight < 0 ? Double.NEGATIVE_INFINITY
+ : (phrase_cumulative_score / total_weight));
+ }
+ }
+
+ private Phrase() {
+ // No-Op
+ }
+
+ private boolean is_indexed;
+ private double total_score = -1.0D; // until we get a computed score, this is "not a phrase"
+
+ private CharSequence subSequence;
+ private BytesRef bytes;
+ private int offset_start;
+ private int offset_end;
+ private int position_start;
+ private int position_end;
+ private Integer checksum = null;
+
+ /** NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves */
+ private final List<Phrase> individualIndexedTerms = new ArrayList<>(7);
+ /**
+ * NOTE: Indexed phrases of length less than the max indexed length are the (sole)
+ * largest sub-phrases of themselves
+ */
+ private final List<Phrase> largestIndexedSubPhrases = new ArrayList<>(7);
+ /** Phrases larger than this phrase which are indexed and fully contain it */
+ private final List<Phrase> indexedSuperPhrases = new ArrayList<>(7);
+
+ // NOTE: keys are field names
+ private final Map<String,Long> subTerms_conjunctionCounts = new TreeMap<>();
+ private final Map<String,Long> phrase_ttf = new TreeMap<>();
+ private final Map<String,Long> phrase_df = new TreeMap<>();
+ private final Map<String,Double> fieldScores = new TreeMap<>();
+
+ public String toString() {
+ return "'" + subSequence + "'"
+ + "[" + offset_start + ":" + offset_end + "]"
+ + "[" + position_start + ":" + position_end + "]";
+ }
+
+ public NamedList<Object> getDetails() {
+ SimpleOrderedMap<Object> out = new SimpleOrderedMap<>();
+ out.add("text", subSequence);
+ out.add("offset_start", getOffsetStart());
+ out.add("offset_end", getOffsetEnd());
+ out.add("score", getTotalScore());
+ out.add("field_scores", fieldScores);
+ return out;
+ }
+
+ /**
+ * Computes & caches the checksum of this Phrase (if not already cached).
+ * Needed only when merging shard data, to validate there are no inconsistencies with the remote shards
+ */
+ private Integer getChecksum() {
+ if (null == checksum) {
+ checksum = Arrays.hashCode(new int[] { offset_start, offset_end, position_start, position_end });
+ }
+ return checksum;
+ }
+ /** The characters from the original input that correspond with this Phrase */
+ public CharSequence getSubSequence() {
+ return subSequence;
+ }
+
+ /**
+ * Returns the list of "individual" (ie: getPositionLength()==1
terms.
+ * NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves
+ */
+ public List getIndividualIndexedTerms() {
+ return individualIndexedTerms;
+ }
+ /**
+ * Returns the list of (overlapping) sub phrases that have the largest possible size based on
+ * the effective value of {@link PhrasesContextData#maxIndexedPositionLength}.
+ * NOTE: Indexed phrases of length less than the max indexed length are the (sole)
+ * largest sub-phrases of themselves.
+ */
+ public List<Phrase> getLargestIndexedSubPhrases() {
+ return largestIndexedSubPhrases;
+ }
+ /**
+ * Returns all phrases larger than this phrase, which fully include this phrase, and are indexed.
+ * NOTE: A Phrase is never the super phrase of itself.
+ */
+ public List<Phrase> getIndexedSuperPhrases() {
+ return indexedSuperPhrases;
+ }
+
+ /** NOTE: positions start at '1' */
+ public int getPositionStart() {
+ return position_start;
+ }
+ /** NOTE: positions start at '1' */
+ public int getPositionEnd() {
+ return position_end;
+ }
+ public int getPositionLength() {
+ return position_end - position_start;
+ }
+ /** Each set bit identifies a position filled by this Phrase */
+ public BitSet getPositionsBitSet() {
+ final BitSet result = new BitSet();
+ result.set(position_start, position_end);
+ return result;
+ }
+ public int getOffsetStart() {
+ return offset_start;
+ }
+ public int getOffsetEnd() {
+ return offset_end;
+ }
+
+ /**
+ * Returns the overall score for this Phrase. In the current implementation,
+ * the only guarantee made regarding the range of possible values is that 0 (or less) means
+ * it is not a good phrase.
+ *
+ * @return A numeric value indicating the confidence in this Phrase, higher numbers are higher confidence.
+ */
+ public double getTotalScore() {
+ return total_score;
+ }
+ /**
+ * Returns the score for this Phrase in this given field. In the current implementation,
+ * the only guarantee made regarding the range of possible values is that 0 (or less) means
+ * it is not a good phrase.
+ *
+ * @return A numeric value indicating the confidence in this Phrase for this field, higher numbers are higher confidence.
+ */
+ public double getFieldScore(String field) {
+ return fieldScores.getOrDefault(field, -1.0D);
+ }
+
+ /**
+ * Returns the total TTF (total term frequency) of this (indexed) Phrase as a term in the specified field.
+ * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
+ * methods has been called with this field.
+ */
+ public long getTTF(String field) {
+ if (!is_indexed) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ "TTF is only available for indexed phrases");
+ }
+ return phrase_ttf.getOrDefault(field, 0L);
+ }
+ /**
+ * Returns the number of documents that contain all of the {@link #getIndividualIndexedTerms}
+ * that make up this Phrase, in the specified field.
+ * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
+ * methods has been called with this field.
+ */
+ public long getConjunctionDocCount(String field) {
+ return subTerms_conjunctionCounts.getOrDefault(field, 0L);
+ }
+ /**
+ * Returns the number of documents that contain this (indexed) Phrase as a term
+ * in the specified field.
+ * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
+ * methods has been called with this field.
+ */
+ public long getDocFreq(String field) {
+ if (!is_indexed) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ "DF is only available for indexed phrases");
+ }
+ return phrase_df.getOrDefault(field, 0L);
+ }
+
+ /**
+ * Uses the previously populated stats to compute a score for the specified field.
+ *
+ * The current implementation returns scores in the range of [0,1], but this
+ * may change in future implementations. The only current guarantees are:
+ *
+ * - 0 (or less) means this is guaranteed to not be a phrase
+ * - larger numbers are higher confidence
+ *
+ * @see #populateStats
+ * @see #populateScores
+ * @see #getFieldScore(String)
+ * @return a score value
+ */
+ private static double computeFieldScore(final Phrase input,
+ final String field,
+ final int maxIndexedPositionLength,
+ final int maxQueryPositionLength) {
+ final long num_indexed_sub_phrases = input.getLargestIndexedSubPhrases().size();
+ assert 0 <= num_indexed_sub_phrases; // should be impossible
+
+ if (input.getIndividualIndexedTerms().size() < input.getPositionLength()) {
+ // there are "gaps" in our input, where individual words have not been indexed (stop words,
+ // or multivalue position gap) which means we are not a viable candidate for being a valid Phrase.
+ return -1.0D;
+ }
+
+ final long phrase_conj_count = input.getConjunctionDocCount(field);
+ // if there isn't a single document containing all the terms in our
+ // phrase, then it is 100% not a phrase
+ if (phrase_conj_count <= 0) {
+ return -1.0D;
+ }
+
+ // single words automatically score 0.0 (unless they already scored less for not existing)
+ if (input.getPositionLength() <= 1) {
+ return 0.0D;
+ }
+
+ double field_score = 0.0D;
+ long max_sub_conj_count = phrase_conj_count;
+
+ // At the moment, the contribution of each "words" sub-Phrase to the field score of the input
+ // Phrase is independent of any context of "input". Depending on if/how sub-phrase scoring
+ // changes, we might consider computing the scores of all the indexed phrases first, and
+ // caching the portions of their values that are re-used when computing the scores of
+ // longer phrases?
+ //
+ // This would make the overall scoring of all phrases a lot more complicated,
+ // but could save CPU cycles?
+ // (particularly when maxIndexedPositionLength <<< maxQueryPositionLength ???)
+ //
+ // My gut says that knowing the conj_count(input) "context" should help us score the
+ // sub-phrases better, but i can't yet put my finger on why/how. maybe by comparing
+ // the conj_count(input) to the max(conj_count(parent of words)) ?
+
+ // for each of the longest indexed phrases, aka indexed sub-sequence of "words", we have...
+ for (Phrase words : input.getLargestIndexedSubPhrases()) {
+ // we're going to compute scores in range of [-1:1] to indicate the likelihood that our
+ // "words" should be used as a "phrase", based on a bayesian document categorization model,
+ // where the "words as a phrase" (aka: phrase) is our candidate category.
+ //
+ // P(words|phrase) * P(phrase) - P(words|not phrase) * P(not phrase)
+ //
+ // Where...
+ // P(words|phrase) = phrase_ttf / min(word_ttf)
+ // P(phrase) =~ phrase_docFreq / conj_count(words in phrase) *SEE NOTE BELOW*
+ // P(words|not phrase) = phrase_ttf / max(word_ttf)
+ // P(not a phrase) = 1 - P(phrase)
+ //
+ // ... BUT! ...
+ //
+ // NOTE: we're going to reduce our "P(phrase)" by the max "P(phrase)" of all the (indexed)
+ // candidate phrases we are a sub-phrase of, to try to offset the inherent bias in favor
+ // of small indexed phrases -- because anytime the super-phrase exists, the sub-phrase exists
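+ //
+ // As a purely illustrative example (all numbers are made up): if ttf("brown fox") = 100,
+ // min(ttf("brown"), ttf("fox")) = 120, max(ttf("brown"), ttf("fox")) = 5000, and the
+ // estimated P(phrase) (after the reduction described above) is 0.8, then:
+ // P(words|phrase) = 100 / 120 ~= 0.83
+ // P(words|not phrase) = 100 / 5000 = 0.02
+ // score ~= (0.83 * 0.8) - (0.02 * 0.2) ~= 0.66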
+
+
+ // IDEA: consider replacing this entire bayesian model with LLR (or rootLLR)...
+ // http://mahout.apache.org/docs/0.13.0/api/docs/mahout-math/org/apache/mahout/math/stats/LogLikelihood.html
+ // ...where we compute LLR over each of the TTF of the pairs of adjacent sub-phrases of each
+ // indexed phrase and take the min|max|avg of the LLR scores.
+ //
+ // ie: for indexed shingle "quick brown fox" compute LLR(ttf("quick"), ttf("brown fox")) &
+ // LLR(ttf("quick brown"), ttf("fox")) using ttf("quick brown fox") as the co-occurance
+ // count, and sumTTF-ttf("quick")-ttf("brown")-ttf("fox") as the "something else"
+ //
+ // (we could actually compute LLR stats over TTF and DF and combine them)
+ //
+ // NOTE: Going the LLR/rootLLR route would require building a full "tree" of every (indexed)
+ // sub-phrase of every other phrase (or at least: all siblings of diff sizes that add up to
+ // an existing phrase). As well as require us to give up on a predictable "range" of
+ // legal values for scores (IIUC from the LLR docs)
+
+ final long phrase_ttf = words.getTTF(field);
+ final long phrase_df = words.getDocFreq(field);
+ final long words_conj_count = words.getConjunctionDocCount(field);
+ max_sub_conj_count = Math.max(words_conj_count, max_sub_conj_count);
+
+ final double max_wrapper_phrase_probability =
+ words.getIndexedSuperPhrases().stream()
+ .mapToDouble(p -> p.getConjunctionDocCount(field) <= 0 ?
+ // special case check -- we already know *our* conj count > 0,
+ // but we need a similar check for wrapper phrases: if <= 0, their probability is 0
+ 0.0D : ((double)p.getDocFreq(field) / p.getConjunctionDocCount(field))).max().orElse(0.0D);
+
+ final LongSummaryStatistics words_ttfs =
+ words.getIndividualIndexedTerms().stream()
+ .collect(Collectors.summarizingLong(t -> t.getTTF(field)));
+
+ final double words_phrase_prob = (phrase_ttf / (double)words_ttfs.getMin());
+ final double words_not_phrase_prob = (phrase_ttf / (double)words_ttfs.getMax());
+
+ final double phrase_prob = (phrase_conj_count / (double)words_conj_count);
+
+
+ final double phrase_score = words_phrase_prob * (phrase_prob - max_wrapper_phrase_probability);
+ final double not_phrase_score = words_not_phrase_prob * (1 - (phrase_prob - max_wrapper_phrase_probability));
+ final double words_score = phrase_score - not_phrase_score;
+
+ field_score += words_score;
+ }
+
+ // NOTE: the "scaling" factors below can "increase" negative scores (by reducing the unsigned value)
+ // when they should ideally be penalizing the scores further, but since we currently don't care
+ // about any score lower than 0, it's not worth worrying about.
+
+ // Average the accumulated score over the number of actual indexed sub-phrases that contributed
+ //
+ // NOTE: since we subsequently want to multiply the score by a fraction with num_indexed_sub_phrases
+ // in the numerator, we can skip this...
+ // SEE BELOW // field_score /= (double) num_indexed_sub_phrases;
+
+ // If we leave field_score as is, then a phrase longer then the maxIndexedPositionLength
+ // will never score higher than the highest scoring sub-phrase it has (because we've averaged them)
+ // so we scale the scores against the longest possible phrase length we're considering
+ //
+ // NOTE: We don't use num_indexed_sub_phrases in the numerator since we skipped it when
+ // averaging above...
+ field_score *= ( 1.0D // SEE ABOVE // * ( (double)num_indexed_sub_phrases )
+ / (1 + maxQueryPositionLength - maxIndexedPositionLength) );
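+ // (for example, with the class-level example config of index maxShingleSize=3 and
+ // query maxShingleSize=7, the scaling factor above is 1 / (1 + 7 - 3) = 0.2)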
+
+ // scale the field_score based on the ratio of the conjunction docCount for the whole phrase
+ // relative to the largest conjunction docCount of its (largest indexed) sub phrases, to penalize
+ // the scores of very long phrases that exist very rarely relative to how often their
+ // sub phrases exist in the index
+ field_score *= ( ((double) phrase_conj_count) / max_sub_conj_count);
+
+ return field_score;
+ }
+ }
+
+ /**
+ * Helper method, public for testing purposes only.
+ *
+ * Given an analyzer, inspects it to determine if:
+ *
+ * - it is a {@link TokenizerChain}
+ * - it contains exactly one instance of {@link ShingleFilterFactory}
+ *
+ * If these conditions are met, then this method returns the maxShingleSize
+ * in effect for this analyzer, otherwise returns -1.
+ *
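+ * For example, with the query analyzer shown in the {@link PhrasesIdentificationComponent}
+ * class documentation (a ShingleFilterFactory configured with maxShingleSize="7"), this
+ * method would return 7; for an analyzer that is not a TokenizerChain it returns -1.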
+ *
+ * @param analyzer An analyzer to inspect
+ * @return the effective maxShingleSize if available, otherwise -1
+ * @lucene.internal
+ */
+ public static int getMaxShingleSize(Analyzer analyzer) {
+ if (!TokenizerChain.class.isInstance(analyzer)) {
+ return -1;
+ }
+
+ final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
+ if (0 == factories.length) {
+ return -1;
+ }
+ int result = -1;
+ for (TokenFilterFactory tff : factories) {
+ if (ShingleFilterFactory.class.isInstance(tff)) {
+ if (0 < result) {
+ // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
+ return -1;
+ }
+ // would be nice if there was an easy way to just ask a factory for the effective value
+ // of an argument...
+ final Map<String,String> args = tff.getOriginalArgs();
+ result = args.containsKey("maxShingleSize")
+ ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
+ }
+ }
+ return result;
+ }
+}
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-phrases-identification.xml b/solr/core/src/test-files/solr/collection1/conf/schema-phrases-identification.xml
new file mode 100644
index 00000000000..ab38f9fee3b
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-phrases-identification.xml
@@ -0,0 +1,97 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ id
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrases-identification.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrases-identification.xml
new file mode 100644
index 00000000000..65ccd5e0965
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrases-identification.xml
@@ -0,0 +1,53 @@
+
+
+
+
+ ${tests.luceneMatchVersion:LATEST}
+
+
+
+
+
+
+
+
+ phrases
+
+
+ explicit
+ true
+ body
+ multigrams_body multigrams_title^2
+
+
+
+
+
+
+ phrases
+
+
+ explicit
+ true
+ true
+ multigrams_body multigrams_title^2
+
+
+
+
+
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java
new file mode 100644
index 00000000000..cbe1cdce946
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudPhrasesIdentificationComponent.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.cloud;
+
+import java.lang.invoke.MethodHandles;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.lucene.util.LuceneTestCase.Slow;
+import org.apache.lucene.util.TestUtil;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.QueryRequest;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+/**
+ * A very simple sanity check that Phrase Identification works across a cloud cluster
+ * using distributed term stat collection.
+ *
+ * @see org.apache.solr.handler.component.PhrasesIdentificationComponentTest
+ */
+@Slow
+public class TestCloudPhrasesIdentificationComponent extends SolrCloudTestCase {
+
+ private static final String DEBUG_LABEL = MethodHandles.lookup().lookupClass().getName();
+ private static final String COLLECTION_NAME = DEBUG_LABEL + "_collection";
+
+ /** A basic client for operations at the cloud level, default collection will be set */
+ private static CloudSolrClient CLOUD_CLIENT;
+ /** One client per node */
+ private static ArrayList<HttpSolrClient> CLIENTS = new ArrayList<>(5);
+
+ @BeforeClass
+ private static void createMiniSolrCloudCluster() throws Exception {
+
+ // multi replicas should not matter...
+ final int repFactor = usually() ? 1 : 2;
+ // ... but we definitely want to test multiple shards
+ final int numShards = TestUtil.nextInt(random(), 1, (usually() ? 2 :3));
+ final int numNodes = (numShards * repFactor);
+
+ final String configName = DEBUG_LABEL + "_config-set";
+ final Path configDir = Paths.get(TEST_HOME(), "collection1", "conf");
+
+ configureCluster(numNodes).addConfig(configName, configDir).configure();
+
+ Map<String,String> collectionProperties = new LinkedHashMap<>();
+ collectionProperties.put("config", "solrconfig-phrases-identification.xml");
+ collectionProperties.put("schema", "schema-phrases-identification.xml");
+ CollectionAdminRequest.createCollection(COLLECTION_NAME, configName, numShards, repFactor)
+ .setProperties(collectionProperties)
+ .process(cluster.getSolrClient());
+
+ CLOUD_CLIENT = cluster.getSolrClient();
+ CLOUD_CLIENT.setDefaultCollection(COLLECTION_NAME);
+
+ waitForRecoveriesToFinish(CLOUD_CLIENT);
+
+ for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
+ CLIENTS.add(getHttpSolrClient(jetty.getBaseUrl() + "/" + COLLECTION_NAME + "/"));
+ }
+
+ // index some docs...
+ CLOUD_CLIENT.add
+ (sdoc("id", "42",
+ "title","Tale of the Brown Fox: was he lazy?",
+ "body", "No. The quick brown fox was a very brown fox who liked to get into trouble."));
+ CLOUD_CLIENT.add
+ (sdoc("id", "43",
+ "title","A fable in two acts",
+ "body", "The brOwn fOx jumped. The lazy dog did not"));
+ CLOUD_CLIENT.add
+ (sdoc("id", "44",
+ "title","Why the LazY dog was lazy",
+ "body", "News flash: Lazy Dog was not actually lazy, it just seemd so compared to Fox"));
+ CLOUD_CLIENT.add
+ (sdoc("id", "45",
+ "title","Why Are We Lazy?",
+ "body", "Because we are. that's why"));
+ CLOUD_CLIENT.commit();
+ }
+
+ @AfterClass
+ private static void afterClass() throws Exception {
+ CLOUD_CLIENT.close(); CLOUD_CLIENT = null;
+ for (HttpSolrClient client : CLIENTS) {
+ client.close();
+ }
+ CLIENTS = null;
+ }
+
+ public void testBasicPhrases() throws Exception {
+ final String input = " did a Quick brown FOX perniciously jump over the lazy dog";
+ final String expected = " did a Quick {brown FOX} perniciously jump over {the lazy dog}";
+
+ // based on the documents indexed, these assertions should all pass regardless of
+ // how many shards we have, or whether the request is done via /phrases or /select...
+ for (String path : Arrays.asList("/select", "/phrases")) {
+ // ... or if we muck with "q" and use the alternative phrases.q for the bits we care about...
+ for (SolrParams p : Arrays.asList(params("q", input, "phrases", "true"),
+ params("q", "*:*", "phrases.q", input, "phrases", "true"),
+ params("q", "-*:*", "phrases.q", input, "phrases", "true"))) {
+ final QueryRequest req = new QueryRequest(p);
+ req.setPath(path);
+ final QueryResponse rsp = req.process(getRandClient(random()));
+ try {
+ NamedList phrases = (NamedList) rsp.getResponse().get("phrases");
+ assertEquals("input", input, phrases.get("input"));
+ assertEquals("summary", expected, phrases.get("summary"));
+
+ final List<NamedList<Object>> details = (List<NamedList<Object>>) phrases.get("details");
+ assertNotNull("null details", details);
+ assertEquals("num phrases found", 2, details.size());
+
+ final NamedList lazy_dog = details.get(0);
+ assertEquals("dog text", "the lazy dog", lazy_dog.get("text"));
+ assertEquals("dog score", 0.166666D, ((Double)lazy_dog.get("score")).doubleValue(), 0.000001D);
+
+ final NamedList brown_fox = details.get(1);
+ assertEquals("fox text", "brown FOX", brown_fox.get("text"));
+ assertEquals("fox score", 0.083333D, ((Double)brown_fox.get("score")).doubleValue(), 0.000001D);
+
+ } catch (AssertionError e) {
+ throw new AssertionError(e.getMessage() + " ::: " + path + " ==> " + rsp, e);
+ }
+ }
+ }
+ }
+
+ public void testEmptyInput() throws Exception {
+ // empty input shouldn't error, just produce empty results...
+ for (String input : Arrays.asList("", " ")) {
+ for (SolrParams p : Arrays.asList(params("q", "*:*", "phrases.q", input, "phrases", "true"),
+ params("q", "-*:*", "phrases.q", input, "phrases", "true"))) {
+ final QueryRequest req = new QueryRequest(p);
+ req.setPath("/phrases");
+ final QueryResponse rsp = req.process(getRandClient(random()));
+ try {
+ NamedList phrases = (NamedList) rsp.getResponse().get("phrases");
+ assertEquals("input", input, phrases.get("input"));
+ assertEquals("summary", input, phrases.get("summary"));
+
+ final List<NamedList<Object>> details = (List<NamedList<Object>>) phrases.get("details");
+ assertNotNull("null details", details);
+ assertEquals("num phrases found", 0, details.size());
+
+ } catch (AssertionError e) {
+ throw new AssertionError(e.getMessage() + " ==> " + rsp, e);
+ }
+ }
+ }
+ }
+
+ /**
+ * returns a random SolrClient -- either a CloudSolrClient, or an HttpSolrClient pointed
+ * at a node in our cluster
+ */
+ public static SolrClient getRandClient(Random rand) {
+ int numClients = CLIENTS.size();
+ int idx = TestUtil.nextInt(rand, 0, numClients);
+
+ return (idx == numClients) ? CLOUD_CLIENT : CLIENTS.get(idx);
+ }
+
+ public static void waitForRecoveriesToFinish(CloudSolrClient client) throws Exception {
+ assert null != client.getDefaultCollection();
+ AbstractDistribZkTestBase.waitForRecoveriesToFinish(client.getDefaultCollection(),
+ client.getZkStateReader(),
+ true, true, 330);
+ }
+
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java
new file mode 100644
index 00000000000..c8d9edfacef
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java
@@ -0,0 +1,796 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.component;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.function.BiConsumer;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.handler.component.PhrasesIdentificationComponent;
+import org.apache.solr.handler.component.PhrasesIdentificationComponent.Phrase;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.params.ShardParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+
+import org.junit.After;
+import org.junit.BeforeClass;
+import org.junit.Before;
+import org.hamcrest.Description;
+import org.hamcrest.Matcher;
+import org.hamcrest.BaseMatcher;
+
+public class PhrasesIdentificationComponentTest extends SolrTestCaseJ4 {
+
+ private static final String HANDLER = "/phrases";
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-phrases-identification.xml","schema-phrases-identification.xml");
+ }
+
+ @Before
+ public void addSomeDocs() throws Exception {
+ assertU(adoc("id", "42",
+ "title","Tale of the Brown Fox: was he lazy?",
+ "body", "No. The quick brown fox was a very brown fox who liked to get into trouble."));
+ assertU(adoc("id", "43",
+ "title","A fable in two acts",
+ "body", "The brOwn fOx jumped. The lazy dog did not"));
+ assertU(adoc("id", "44",
+ "title","Why the LazY dog was lazy",
+ "body", "News flash: Lazy Dog was not actually lazy, it just seemd so compared to Fox"));
+ assertU(adoc("id", "45",
+ "title","Why Are We Lazy?",
+ "body", "Because we are. that's why"));
+ assertU((commit()));
+ }
+
+ @After
+ public void deleteAllDocs() throws Exception {
+ assertU(delQ("*:*"));
+ assertU((commit()));
+ }
+
+ public void testWhiteBoxPhraseParsingLongInput() throws Exception {
+ final SchemaField field = h.getCore().getLatestSchema().getField("multigrams_body");
+ assertNotNull(field);
+ final List<Phrase> phrases = Phrase.extractPhrases
+ (" did a Quick brown FOX perniciously jump over the lAZy dog", field, 3, 7);
+
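+ // with 11 input words and a max query phrase length of 7, there is one candidate phrase
+ // per start position for each length 1..7, ie: 11 + 10 + ... + 5 phrases in total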
+ assertEquals(IntStream.rangeClosed((11-7+1), 11).sum(), // 11 words, max query phrase size is 7
+ phrases.size());
+
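+ // (phrases are ordered by start position, then by increasing length, which is what the
+ // index arithmetic in the spot checks below relies on)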
+ // spot check a few explicitly chosen phrases of various lengths...
+
+ { // single term, close to edge so not as many super phrases as other terms might have
+ final Phrase lazy = phrases.get(phrases.size() - 1 - 2);
+ final String debug = lazy.toString();
+
+ assertEquals(debug, "lAZy", lazy.getSubSequence());
+ assertEquals(debug, 10, lazy.getPositionStart());
+ assertEquals(debug, 11, lazy.getPositionEnd());
+ assertEquals(debug, 1, lazy.getPositionLength());
+
+ assertEquals(debug, 54, lazy.getOffsetStart());
+ assertEquals(debug, 58, lazy.getOffsetEnd());
+
+ assertEquals(debug, 1, lazy.getIndividualIndexedTerms().size());
+ assertEquals(debug, 1, lazy.getLargestIndexedSubPhrases().size());
+ assertEquals(debug, lazy, lazy.getIndividualIndexedTerms().get(0));
+ assertEquals(debug, lazy, lazy.getLargestIndexedSubPhrases().get(0));
+ assertEquals(debug, 4, lazy.getIndexedSuperPhrases().size()); // (2 each: len=2, len=3)
+ }
+ { // length 2, middle of the pack
+ final Phrase brown_fox = phrases.get((7 * 3) + 1);
+ final String debug = brown_fox.toString();
+
+ assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
+ assertEquals(debug, 4, brown_fox.getPositionStart());
+ assertEquals(debug, 6, brown_fox.getPositionEnd());
+ assertEquals(debug, 2, brown_fox.getPositionLength());
+
+ assertEquals(debug, 17, brown_fox.getOffsetStart());
+ assertEquals(debug, 26, brown_fox.getOffsetEnd());
+
+ assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
+ assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
+ assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
+ assertEquals(debug, 2, brown_fox.getIndexedSuperPhrases().size()); // (2 @ len=3)
+
+ }
+ { // length 3 (which is the max indexed size) @ start of the string
+ final Phrase daq = phrases.get(2);
+ final String debug = daq.toString();
+
+ assertEquals(debug, "did a Quick", daq.getSubSequence());
+ assertEquals(debug, 1, daq.getPositionStart());
+ assertEquals(debug, 4, daq.getPositionEnd());
+ assertEquals(debug, 3, daq.getPositionLength());
+
+ assertEquals(debug, 1, daq.getOffsetStart());
+ assertEquals(debug, 13, daq.getOffsetEnd());
+
+ assertEquals(debug, 3, daq.getIndividualIndexedTerms().size());
+ assertEquals(debug, 1, daq.getLargestIndexedSubPhrases().size());
+ assertEquals(debug, daq, daq.getLargestIndexedSubPhrases().get(0));
+ assertEquals(debug, 0, daq.getIndexedSuperPhrases().size());
+ }
+ { // length 4 phrase (larger than the max indexed size)
+ final Phrase qbfp = phrases.get((7 * 2) + 3);
+ final String debug = qbfp.toString();
+
+ assertEquals(debug, "Quick brown FOX perniciously", qbfp.getSubSequence());
+ assertEquals(debug, 3, qbfp.getPositionStart());
+ assertEquals(debug, 7, qbfp.getPositionEnd());
+ assertEquals(debug, 4, qbfp.getPositionLength());
+
+ assertEquals(debug, 8, qbfp.getOffsetStart());
+ assertEquals(debug, 39, qbfp.getOffsetEnd());
+
+ assertEquals(debug, 4, qbfp.getIndividualIndexedTerms().size());
+ assertEquals(debug, 2, qbfp.getLargestIndexedSubPhrases().size());
+ assertEquals(debug, 0, qbfp.getIndexedSuperPhrases().size());
+ }
+
+ // some blanket assumptions about the results...
+ assertBasicSanityChecks(phrases, 11, 3, 7);
+ }
+
+ public void testWhiteBoxPhraseParsingShortInput() throws Exception {
+ // for input this short, either of these fields should be (mostly) equivalent
+ final Map<String,Integer> fields = new TreeMap<>();
+ fields.put("multigrams_body", 7);
+ fields.put("multigrams_body_short", 3);
+ for (Map.Entry<String,Integer> entry : fields.entrySet()) {
+ try {
+ final int maxQ = entry.getValue();
+ final SchemaField field = h.getCore().getLatestSchema().getField(entry.getKey());
+ assertNotNull(field);
+
+ // empty input shouldn't break anything
+ assertEquals(0, Phrase.extractPhrases(random().nextBoolean() ? "" : " ", field, 3, maxQ).size());
+
+ // input shorter than our index/query phrase sizes shouldn't break anything either....
+ final List phrases = Phrase.extractPhrases("brown FOX", field, 3, maxQ);
+
+ assertEquals(3, phrases.size());
+
+ { // length 2
+ final Phrase brown_fox = phrases.get(1);
+ final String debug = brown_fox.toString();
+
+ assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
+ assertEquals(debug, 1, brown_fox.getPositionStart());
+ assertEquals(debug, 3, brown_fox.getPositionEnd());
+ assertEquals(debug, 2, brown_fox.getPositionLength());
+
+ assertEquals(debug, 0, brown_fox.getOffsetStart());
+ assertEquals(debug, 9, brown_fox.getOffsetEnd());
+
+ assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
+ assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
+ assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
+ assertEquals(debug, 0, brown_fox.getIndexedSuperPhrases().size());
+ }
+ { // length 1
+ final Phrase fox = phrases.get(2);
+ final String debug = fox.toString();
+
+ assertEquals(debug, "FOX", fox.getSubSequence());
+ assertEquals(debug, 2, fox.getPositionStart());
+ assertEquals(debug, 3, fox.getPositionEnd());
+ assertEquals(debug, 1, fox.getPositionLength());
+
+ assertEquals(debug, 6, fox.getOffsetStart());
+ assertEquals(debug, 9, fox.getOffsetEnd());
+
+ assertEquals(debug, 1, fox.getIndividualIndexedTerms().size());
+ assertEquals(debug, 1, fox.getLargestIndexedSubPhrases().size());
+ assertEquals(debug, fox, fox.getLargestIndexedSubPhrases().get(0));
+ assertEquals(debug, 1, fox.getIndexedSuperPhrases().size());
+ }
+
+ assertBasicSanityChecks(phrases, 2, 3, maxQ);
+ } catch (AssertionError e) {
+ throw new AssertionError(entry.getKey() + " => " + e.getMessage(), e);
+ }
+ }
+ }
+
+ /**
+ * Asserts some basic rules that should be enforced about all Phrases
+ * & their linkages to oher phrases
+ */
+ private void assertBasicSanityChecks(final List<Phrase> phrases,
+ final int inputPositionLength,
+ final int maxIndexedPositionLength,
+ final int maxQueryPositionLength) throws Exception {
+ assert 0 < phrases.size() : "Don't use this method if phrases might be empty";
+
+ assertEmptyStream("no phrase should be longer then "+maxQueryPositionLength+" positions",
+ phrases.stream().filter(p -> p.getPositionLength() > maxQueryPositionLength));
+
+ assertEmptyStream("no phrase should have a start offset < 0",
+ phrases.stream().filter(p -> p.getOffsetStart() < 0));
+ assertEmptyStream("no phrase should have a start position < 1",
+ phrases.stream().filter(p -> p.getPositionStart() < 1));
+
+ assertEmptyStream("If a phrase has a start offset of 0, then it must have position 1",
+ phrases.stream().filter(p -> (p.getOffsetStart() == 0)
+ && (p.getPositionStart() != 1)));
+
+ final Phrase first = phrases.get(0);
+ final Phrase last = phrases.get(phrases.size()-1);
+
+ assertEmptyStream("no phrase should have a start offset < first phrase",
+ phrases.stream().filter(p -> p.getOffsetStart() < first.getOffsetStart()));
+ assertEmptyStream("no phrase should have an end offset > last phrase",
+ phrases.stream().filter(p -> last.getOffsetEnd() < p.getOffsetEnd()));
+
+ assertEmptyStream("no phrase should have a start position < first phrase",
+ phrases.stream().filter(p -> p.getPositionStart() < first.getPositionStart()));
+ assertEmptyStream("no phrase should have an end position > last phrase",
+ phrases.stream().filter(p -> last.getPositionEnd() < p.getPositionEnd()));
+
+
+ // NOTE: stuff below this point may not be true for all analyzers (ie: stopwords)
+ // but should be valid for the analyzers used in this test...
+ // (if we expand test to cover analyzers w/stopwords, refactor this into a new method)
+
+ for (int n = 1; n <= maxQueryPositionLength; n++) {
+ final int len = n;
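+ // a sliding window of length n over inputPositionLength positions yields
+ // (1 + inputPositionLength - n) phrases, or 0 if n exceeds the input length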
+ final int expected = Math.max(0, 1 + inputPositionLength - n);
+ final List<Phrase> sizeN = phrases.stream().filter(p -> p.getPositionLength() == len
+ ).collect(Collectors.toList());
+ assertEquals("Expected # phrases of size " + n + ": " + sizeN, expected, sizeN.size());
+ }
+
+ // check the quantities of sub-terms/phrases...
+ assertEmptyStream("no phrase should have num indexed terms != pos_len",
+ phrases.stream().filter
+ (p -> p.getPositionLength() != p.getIndividualIndexedTerms().size()));
+ assertEmptyStream("no phrase should have num sub-phrases != max(1, 1 + pos_len - "+maxIndexedPositionLength+")",
+ phrases.stream().filter
+ (p -> (Math.max(1, 1 + p.getPositionLength() - maxIndexedPositionLength)
+ != p.getLargestIndexedSubPhrases().size())));
+ // NOTE: indexed super phrases can be of various lengths, and differing quantities near
+ // beginning/end of input so don't worry about an exact count, just check their properties (below)
+
+ // check the properties of our sub/super phrases
+ for (Phrase phrase : phrases) {
+ final String debug = phrase.toString();
+
+ assertEmptyStream(debug + " should not have any indexed terms where pos_len != 1",
+ phrase.getIndividualIndexedTerms().stream().filter
+ (term -> 1 != term.getPositionLength()));
+
+ assertEmptyStream(debug + " should not have any sub-phrases where pos_len > min(pos_len, "
+ + maxIndexedPositionLength+")",
+ phrase.getLargestIndexedSubPhrases().stream().filter
+ (inner -> (Math.min(phrase.getPositionLength(), maxIndexedPositionLength)
+ < inner.getPositionLength())));
+
+ assertEmptyStream(debug + " should not have any super-phrases where super.len <= phrase.len or "
+ + maxIndexedPositionLength + " < super.len",
+ phrase.getIndexedSuperPhrases().stream().filter
+ (outer -> (outer.getPositionLength() <= phrase.getPositionLength() ||
+ maxIndexedPositionLength < outer.getPositionLength())));
+ }
+ }
+
+ public void testWhiteboxStats() throws Exception {
+ final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
+ assertNotNull(analysisField);
+ final String input = "BROWN fox lAzY dog xxxyyyzzz";
+
+ // a function we'll re-use on phrases generated from the above input
+ // the multiplier lets us simulate multiple shards returning the same values
+ BiConsumer<Integer,List<Phrase>> assertions = (mult, phrases) -> {
+ final Phrase brown_fox = phrases.get(1);
+ assertEquals("BROWN fox", brown_fox.getSubSequence());
+
+ assertEquals(mult * 1, brown_fox.getTTF("multigrams_title"));
+ assertEquals(mult * 1, brown_fox.getDocFreq("multigrams_title"));
+ assertEquals(mult * 1, brown_fox.getConjunctionDocCount("multigrams_title"));
+
+ assertEquals(mult * 3, brown_fox.getTTF("multigrams_body"));
+ assertEquals(mult * 2, brown_fox.getDocFreq("multigrams_body"));
+ assertEquals(mult * 2, brown_fox.getConjunctionDocCount("multigrams_body"));
+
+ final Phrase fox_lazy = phrases.get(6);
+ assertEquals("fox lAzY", fox_lazy.getSubSequence());
+
+ assertEquals(mult * 0, fox_lazy.getTTF("multigrams_title"));
+ assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_title"));
+ assertEquals(mult * 1, fox_lazy.getConjunctionDocCount("multigrams_title"));
+
+ assertEquals(mult * 0, fox_lazy.getTTF("multigrams_body"));
+ assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_body"));
+ assertEquals(mult * 2, fox_lazy.getConjunctionDocCount("multigrams_body"));
+
+ final Phrase bfld = phrases.get(3);
+ assertEquals("BROWN fox lAzY dog", bfld.getSubSequence());
+
+ expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_title"); });
+ expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_title"); });
+ assertEquals(mult * 0, bfld.getConjunctionDocCount("multigrams_title"));
+
+ expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_body"); });
+ expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_body"); });
+ assertEquals(mult * 1, bfld.getConjunctionDocCount("multigrams_body"));
+
+ final Phrase xyz = phrases.get(phrases.size()-1);
+
+ assertEquals("xxxyyyzzz", xyz.getSubSequence());
+ assertEquals(mult * 0, xyz.getTTF("multigrams_title"));
+ assertEquals(mult * 0, xyz.getDocFreq("multigrams_title"));
+ assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_title"));
+
+ assertEquals(mult * 0, xyz.getTTF("multigrams_body"));
+ assertEquals(mult * 0, xyz.getDocFreq("multigrams_body"));
+ assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_body"));
+ return;
+ };
+
+
+ final List<Phrase> phrasesLocal = Phrase.extractPhrases(input, analysisField, 3, 7);
+
+ // freshly parsed phrases, w/o any stats populated, all the stats should be 0
+ assertions.accept(0, phrasesLocal);
+
+ // If we populate with our index stats, we should get the basic values in our BiConsumer
+ try (SolrQueryRequest req = req()) {
+ Phrase.populateStats(phrasesLocal, Arrays.asList("multigrams_body","multigrams_title"),
+ req.getSearcher());
+ }
+ assertions.accept(1, phrasesLocal);
+
+ // likewise, if we create a new freshly parsed set of phrases, and "merge" in the previous index stats
+ // (ie: merge results from one shard) we should get the same results
+ final List<Phrase> phrasesMerged = Phrase.extractPhrases(input, analysisField, 3, 7);
+ Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal));
+ assertions.accept(1, phrasesMerged);
+
+ // if we merge in a second copy of the same results (ie: two identical shards)
+ // our results should be double what we had before
+ Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal));
+ assertions.accept(2, phrasesMerged);
+
+ }
+
+ public void testWhiteboxScores() throws Exception {
+ final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
+ assertNotNull(analysisField);
+ final Map<String,Double> fieldWeights = new TreeMap<>();
+ fieldWeights.put("multigrams_title", 1.0D);
+ fieldWeights.put("multigrams_body", 0.0D); // NOTE: 0 weighting should only affect total score
+
+ final String input = "xxxyyyzzz BROWN fox why are we lAzY";
+ final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+ try (SolrQueryRequest req = req()) {
+ Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+ }
+ Phrase.populateScores(phrases, fieldWeights, 3, 7);
+
+ // do some basic sanity checks of the field & total scores...
+
+ for (Phrase xyz : phrases.subList(0, 7)) {
+ // first 7 all start with xyz which isn't in index (in either field) so all scores should be -1
+ assertEquals(xyz.toString(), -1.0D, xyz.getTotalScore(), 0.0D);
+ assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_title"), 0.0D);
+ assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_body"), 0.0D);
+ }
+
+ // any individual terms (past xyz) should score 0.0 because they are all actually in the index
+ // (in both fields)
+ for (Phrase term : phrases.subList(7, phrases.size()).stream().filter
+ ((p -> 1 == p.getPositionLength())).collect(Collectors.toList())) {
+
+ assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_title"), 0.0D);
+ assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_body"), 0.0D);
+ assertEquals(term.toString(), 0.0D, term.getTotalScore(), 0.0D);
+ }
+
+ // "brown fox" should score positively in both fields, and overall...
+ final Phrase brown_fox = phrases.get(8);
+ assertEquals("BROWN fox", brown_fox.getSubSequence());
+ assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
+ assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_body"), greaterThan(0.0D) );
+ assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
+
+ // "we lazy" does appear in a title value, but should score poorly given how often the terms
+ // are used in other contexts, and should score -1 against body -- but because of our weights,
+ // that shouldn't bring down the total
+ final Phrase we_lazy = phrases.get(phrases.size()-2);
+ assertEquals("we lAzY", we_lazy.getSubSequence());
+ assertEquals(we_lazy.toString(), -1.0D, we_lazy.getFieldScore("multigrams_body"), 0.0D);
+ assertThat(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), lessThan(0.0D));
+ assertThat(we_lazy.toString(), we_lazy.getTotalScore(), lessThan(0.0D));
+ assertEquals(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), we_lazy.getTotalScore(),
+ 0.0D);
+
+ // "why are we lazy" is longer then the max indexed phrase size & appears verbatim in a title value
+ // it should score -1 against body -- but because of our weights, that shouldn't bring down the total
+ final Phrase wawl = phrases.get(phrases.size()-7);
+ assertEquals("why are we lAzY", wawl.getSubSequence());
+ assertEquals(wawl.toString(), -1.0D, wawl.getFieldScore("multigrams_body"), 0.0D);
+ assertThat(wawl.toString(), wawl.getFieldScore("multigrams_title"), greaterThan(0.0D));
+ assertThat(wawl.toString(), wawl.getTotalScore(), greaterThan(0.0D));
+ assertEquals(wawl.toString(), wawl.getFieldScore("multigrams_title"), wawl.getTotalScore(),
+ 0.0D);
+
+ // "brown fox why are we" is longer then the max indexed phrase, and none of it's
+ // (longest) sub phrases exists in either field -- so all of it's scores should be -1
+ final Phrase bfwaw = phrases.get(11);
+ assertEquals("BROWN fox why are we", bfwaw.getSubSequence());
+ assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_title"), 0.0D);
+ assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_body"), 0.0D);
+ assertEquals(bfwaw.toString(), -1.0D, bfwaw.getTotalScore(), 0.0D);
+
+ }
+
+ public void testWhiteboxScoresStopwords() throws Exception {
+ final String input = "why the lazy dog brown fox";
+ final Map<String,Double> fieldWeights = new TreeMap<>();
+ fieldWeights.put("multigrams_title", 1.0D);
+ fieldWeights.put("multigrams_title_stop", 1.0D);
+
+ { // Our analysisField uses all terms,
+ // but we also generate scores from a field that filters stopwords...
+ final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title");
+ assertNotNull(analysisField);
+
+ final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+ try (SolrQueryRequest req = req()) {
+ Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+ }
+ Phrase.populateScores(phrases, fieldWeights, 3, 7);
+
+ // phrases that span the stop word should have valid scores from the field that doesn't care
+ // about stop words, but the stopword field should reject them
+ final Phrase why_the_lazy = phrases.get(2);
+ assertEquals("why the lazy", why_the_lazy.getSubSequence());
+ assertThat(why_the_lazy.toString(), why_the_lazy.getFieldScore("multigrams_title"), greaterThan(0.0D) );
+ assertEquals(why_the_lazy.toString(), -1.0D, why_the_lazy.getFieldScore("multigrams_title_stop"), 0.0D);
+
+ final Phrase the_lazy_dog = phrases.get(8);
+ assertEquals("the lazy dog", the_lazy_dog.getSubSequence());
+ assertThat(the_lazy_dog.toString(), the_lazy_dog.getFieldScore("multigrams_title"), greaterThan(0.0D) );
+ assertEquals(the_lazy_dog.toString(), -1.0D, the_lazy_dog.getFieldScore("multigrams_title_stop"), 0.0D);
+
+ // sanity check that good scores are still possible with stopwords
+ // "brown fox" should score positively in both fields, and overall...
+ final Phrase brown_fox = phrases.get(phrases.size()-2);
+ assertEquals("brown fox", brown_fox.getSubSequence());
+ assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
+ assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D) );
+ assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
+ }
+
+ { // now flip things: our analysisField filters stopwords,
+ // but we also generate scores from a field that doesn't know about them...
+ //
+ // (NOTE: the parser will still generate _some_ candidate phrases spanning the stop word position,
+ // but not ones that start with the stopword)
+ final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title_stop");
+ assertNotNull(analysisField);
+
+ final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+ try (SolrQueryRequest req = req()) {
+ Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+ }
+ Phrase.populateScores(phrases, fieldWeights, 3, 7);
+ assertTrue(phrases.toString(), 0 < phrases.size());
+
+ for (Phrase p : phrases) {
+ if (p.getPositionStart() <= 2 && 2 < p.getPositionEnd()) {
+ // with the stopword-filtering field driving analysis, phrases spanning the stop word
+ // position should be rejected (score -1) by both fields
+ assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title"), 0.0D);
+ assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title_stop"), 0.0D);
+ }
+ }
+
+ // sanity check that good scores are still possible with stopwords
+ // "brown fox" should score positively in both fields, and overall...
+ final Phrase brown_fox = phrases.get(phrases.size()-2);
+ assertEquals("brown fox", brown_fox.getSubSequence());
+ assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
+ assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D) );
+ assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
+ }
+
+ }
+
+ public void testExpectedUserErrors() throws Exception {
+ assertQEx("empty field list should error",
+ "must specify a (weighted) list of fields",
+ req("q","foo", "phrases","true",
+ "phrases.fields", " "),
+ ErrorCode.BAD_REQUEST);
+
+ assertQEx("bogus field name should error",
+ "does not exist",
+ req("q","foo", "phrases","true",
+ "phrases.fields", "bogus1 bogus2"),
+ ErrorCode.BAD_REQUEST);
+
+ assertQEx("lack of shingles should cause error",
+ "Unable to determine max position length",
+ req("q","foo", "phrases","true",
+ "phrases.fields", "title"),
+ ErrorCode.BAD_REQUEST);
+
+ assertQEx("analyzer missmatch should cause error",
+ "must have the same fieldType",
+ req("q","foo", "phrases","true",
+ "phrases.fields", "multigrams_title multigrams_title_short"),
+ ErrorCode.BAD_REQUEST);
+
+ assertQEx("analysis field must exist",
+ "does not exist",
+ req("q","foo", "phrases","true",
+ "phrases.analysis.field", "bogus",
+ "phrases.fields", "multigrams_title multigrams_title_short"),
+ ErrorCode.BAD_REQUEST);
+
+ assertQEx("no query param should error",
+ "requires a query string",
+ req("qt", "/phrases",
+ "phrases.fields", "multigrams_title"),
+ ErrorCode.BAD_REQUEST);
+ }
+
+ public void testMaxShingleSizeHelper() throws Exception {
+ IndexSchema schema = h.getCore().getLatestSchema();
+
+ assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
+ (schema.getFieldTypeByName("multigrams_3_7").getIndexAnalyzer()));
+ assertEquals(7, PhrasesIdentificationComponent.getMaxShingleSize
+ (schema.getFieldTypeByName("multigrams_3_7").getQueryAnalyzer()));
+
+ assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
+ (schema.getFieldTypeByName("multigrams_3").getIndexAnalyzer()));
+ assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
+ (schema.getFieldTypeByName("multigrams_3").getQueryAnalyzer()));
+
+ assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize
+ (schema.getFieldTypeByName("text").getIndexAnalyzer()));
+ assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize
+ (schema.getFieldTypeByName("text").getQueryAnalyzer()));
+
+ }
+
+ public void testSimplePhraseRequest() throws Exception {
+ final String input = " did a Quick brown FOX perniciously jump over the lazy dog";
+ final String expected = " did a Quick {brown FOX} perniciously jump over {the lazy dog}";
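+ // (phrases identified by the component are wrapped in '{' and '}' in the summary)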
+
+ // should get same behavior regardless of whether we use "q" or "phrases.q"
+ for (String p : Arrays.asList("q", "phrases.q")) {
+ // basic request...
+ assertQ(req("qt", HANDLER, p, input)
+ // expect no search results...
+ , "count(//result)=0"
+
+ // just phrase info...
+ , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
+ , "//lst[@name='phrases']/str[@name='summary'][.='"+expected+"']"
+ , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 2"
+ //
+ , "//lst[@name='phrases']/arr[@name='details']/lst[1]/str[@name='text'][.='the lazy dog']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst[1]/int[@name='offset_start'][.='50']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst[1]/int[@name='offset_end'][.='62']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst[1]/double[@name='score'][number(.) > 0]"
+ //
+ , "//lst[@name='phrases']/arr[@name='details']/lst[2]/str[@name='text'][.='brown FOX']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst[2]/int[@name='offset_start'][.='17']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst[2]/int[@name='offset_end'][.='26']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst[2]/double[@name='score'][number(.) > 0]"
+ );
+
+ // empty input, empty phrases (and no error)...
+ assertQ(req("qt", HANDLER, p, "")
+ // expect no search results...
+ , "count(//result)=0"
+ // just empty phrase info for our empty input...
+ , "//lst[@name='phrases']/str[@name='input'][.='']"
+ , "//lst[@name='phrases']/str[@name='summary'][.='']"
+ , "count(//lst[@name='phrases']/arr[@name='details']) = 1"
+ , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 0"
+ );
+ }
+ }
+
+ public void testSimpleSearchRequests() throws Exception {
+ final String input = "\"brown fox\"";
+
+ assertQ(req("q", input)
+ // basic search should have worked...
+ , "//result[@numFound='2']"
+ , "//result/doc/str[@name='id'][.='42']"
+ , "//result/doc/str[@name='id'][.='43']"
+ // and phrases should not be returned since they weren't requested...
+ , "0=count(//lst[@name='phrases'])"
+ );
+
+ assertQ(req("phrases", "false", "q", input)
+ // basic search should have worked...
+ , "//result[@numFound='2']"
+ , "//result/doc/str[@name='id'][.='42']"
+ , "//result/doc/str[@name='id'][.='43']"
+ // and phrases should not be returned since they were explicitly disabled...
+ , "0=count(//lst[@name='phrases'])"
+ );
+
+ // with input this short, all of these permutations of requests should produce the same output...
+ for (SolrQueryRequest req : Arrays.asList
+ ( // simple, using 3/7 defaults
+ req("phrases","true", "q", input),
+
+ // simple, using just the 3/3 'short' fields
+ req("phrases","true", "q", input,
+ "phrases.fields", "multigrams_body_short multigrams_title_short^2"),
+
+ // diff analyzers, but explicit override using 3/3 "short" field...
+ req("phrases","true", "q", input,
+ "phrases.fields", "multigrams_body multigrams_title_short^2",
+ "phrases.analysis.field", "multigrams_title_short"))) {
+ assertQ(req
+ // basic search should have worked...
+ , "//result[@numFound='2']"
+ , "//result/doc/str[@name='id'][.='42']"
+ , "//result/doc/str[@name='id'][.='43']"
+
+ // and we should have gotten phrase info...
+ , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
+ , "//lst[@name='phrases']/str[@name='summary'][.='\"{brown fox}\"']"
+ , "count(//lst[@name='phrases']/arr[@name='details']/lst)=1"
+ , "//lst[@name='phrases']/arr[@name='details']/lst/str[@name='text'][.='brown fox']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_start'][.='1']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_end'][.='10']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst/double[@name='score'][number(.) > 0]"
+ );
+ }
+
+ // override the query string to get different phrases
+ assertQ(req("phrases","true", "q", "*:*", "phrases.q", input)
+ // basic search should have found all docs...
+ , "//result[@numFound='4']"
+ // and we should have gotten phrase info for our alternative q string...
+ , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
+ , "//lst[@name='phrases']/str[@name='summary'][.='\"{brown fox}\"']"
+ , "count(//lst[@name='phrases']/arr[@name='details']/lst)=1"
+ , "//lst[@name='phrases']/arr[@name='details']/lst/str[@name='text'][.='brown fox']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_start'][.='1']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_end'][.='10']"
+ , "//lst[@name='phrases']/arr[@name='details']/lst/double[@name='score'][number(.) > 0]"
+ );
+
+ // empty input, empty phrases (but no error)
+ assertQ(req("phrases","true", "q", "*:*", "phrases.q", "")
+ // basic search should have found all docs...
+ , "//result[@numFound='4']"
+ // and we should have gotten (empty) phrase info for our alternative q string...
+ , "//lst[@name='phrases']/str[@name='input'][.='']"
+ , "//lst[@name='phrases']/str[@name='summary'][.='']"
+ , "count(//lst[@name='phrases']/arr[@name='details']) = 1"
+ , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 0"
+ );
+ }
+
+ public void testGreyboxShardSearchRequests() throws Exception {
+ final String input = "quick brown fox ran";
+
+ final String phrase_xpath = "//lst[@name='phrases']";
+ final String all_phrase_xpath = phrase_xpath + "/arr[@name='_all']";
+
+ // phrases requested, and correct request stage / shard purpose ...
+ assertQ(req("q", input,
+ "phrases","true",
+ ShardParams.IS_SHARD, "true",
+ ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)
+
+ // this shard request should have caused stats to be returned about all phrases...
+ , "10=count("+ all_phrase_xpath +"/lst)"
+ // "quick" ...
+ , all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_body'][.='1']"
+ , all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_title'][.='0']"
+ // ...
+ // "brown fox"
+ , all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_body'][.='3']"
+ , all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_title'][.='1']"
+ , all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_body'][.='2']"
+ , all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_title'][.='1']"
+ , all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_body'][.='2']"
+ , all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_title'][.='1']"
+
+ // but no computed "scores"...
+ , "0=count("+phrase_xpath+"//*[@name='score'])"
+ );
+
+ // phrases requested, but incorrect request stage / shard purpose ...
+ assertQ(req("q", input,
+ "phrases","true",
+ ShardParams.IS_SHARD, "true",
+ ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
+ , "0=count("+ phrase_xpath +"/lst)");
+
+ // phrases disabled, regardless of request stage / shard purpose ...
+ assertTrue("sanity check failed, stage was modified in code w/o updating test",
+ PhrasesIdentificationComponent.SHARD_PURPOSE != ShardRequest.PURPOSE_GET_FIELDS);
+ assertQ(req("q", input,
+ "phrases","false",
+ ShardParams.IS_SHARD, "true",
+ ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
+ , "0=count("+ phrase_xpath +"/lst)");
+ assertQ(req("q", input,
+ "phrases","false",
+ ShardParams.IS_SHARD, "true",
+ ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)
+ , "0=count("+ phrase_xpath +"/lst)");
+ }
+
+
+
+ // ////////////////////////////////////////////////////////////////
+
+
+
+
+ /**
+ * Trivial Helper method that collects & compares to an empty List so
+ * the assertion shows the unexpected stream elements
+ */
+ public <T> void assertEmptyStream(final String msg, final Stream<? extends T> stream) {
+ assertEquals(msg,
+ Collections.emptyList(),
+ stream.collect(Collectors.toList()));
+ }
+
+ /** helper, docs for future junit/hamcrest seem to have something similar */
+ public static Matcher<Double> lessThan(double expected) {
+ return new BaseMatcher<Double>() {
+ @Override public boolean matches(Object actual) {
+ return ((Double)actual).compareTo(expected) < 0;
+ }
+ @Override public void describeTo(Description d) {
+ d.appendText("should be less than " + expected);
+ }
+ };
+ }
+ /** helper, docs for future junit/hamcrest seem to have something similar */
+ public static Matcher<Double> greaterThan(double expected) {
+ return new BaseMatcher<Double>() {
+ @Override public boolean matches(Object actual) {
+ return 0 < ((Double)actual).compareTo(expected);
+ }
+ @Override public void describeTo(Description d) {
+ d.appendText("should be greater than " + expected);
+ }
+ };
+ }
+}