mirror of https://github.com/apache/lucene.git

SOLR-9418: Added a new (experimental) PhrasesIdentificationComponent for identifying potential phrases in query input based on overlapping shingles in the index

commit 597bd5db77 (parent cac589b803)
CHANGES.txt
@@ -1,4 +1,4 @@
 Apache Solr Release Notes

 Introduction
 ------------

@@ -208,6 +208,9 @@ New Features
   doc transformers if present. In 7.5 a missing 'fl' defaults to the current behavior of all fields, but in 8.0
   defaults to the top/request "fl". (Moshe Bla, David Smiley)

+* SOLR-9418: Added a new (experimental) PhrasesIdentificationComponent for identifying potential phrases
+  in query input based on overlapping shingles in the index. (Akash Mehta, Trey Grainger, hossman)
+
 Bug Fixes
 ----------------------
|
File diff suppressed because it is too large
Load Diff
|
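The suppressed diff above is presumably the new PhrasesIdentificationComponent implementation itself. As rough orientation, here is a hedged sketch of the whitebox Phrase API that the tests later in this commit exercise; every signature below is inferred from those tests, not taken from the suppressed file.

// Hedged sketch (not part of the commit): the Phrase API as the tests below use it.
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.solr.handler.component.PhrasesIdentificationComponent.Phrase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;

class PhraseApiSketch {
  // 'req' and 'field' are assumed to come from a live core; 'field' must use a
  // shingle-producing fieldType such as multigrams_3_7 in the test schema below
  static void sketch(SolrQueryRequest req, SchemaField field) throws Exception {
    // parse every candidate phrase: indexed shingles are at most 3 positions
    // long, query-time shingles at most 7
    List<Phrase> phrases = Phrase.extractPhrases("quick brown fox", field, 3, 7);

    // collect stats (ttf, docFreq, conjunction doc counts) from the local index
    Phrase.populateStats(phrases, Arrays.asList("multigrams_body", "multigrams_title"),
                         req.getSearcher());

    // score each candidate against a weighted list of fields
    Map<String,Double> fieldWeights = new TreeMap<>();
    fieldWeights.put("multigrams_body", 1.0D);
    fieldWeights.put("multigrams_title", 2.0D);
    Phrase.populateScores(phrases, fieldWeights, 3, 7);

    for (Phrase p : phrases) {
      System.out.println(p.getSubSequence() + " => " + p.getTotalScore());
    }
  }
}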
schema-phrases-identification.xml
@@ -0,0 +1,97 @@
<?xml version="1.0" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<schema name="phrase-identification" version="1.6">
  <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
  <field name="_version_" type="long" indexed="true" stored="true"/>

  <field name="title" type="text" indexed="true" stored="true" />
  <field name="body" type="text" indexed="true" stored="true" />

  <field name="multigrams_title" type="multigrams_3_7" indexed="true" stored="false" />
  <field name="multigrams_body" type="multigrams_3_7" indexed="true" stored="false" />

  <field name="multigrams_title_short" type="multigrams_3" indexed="true" stored="false" />
  <field name="multigrams_body_short" type="multigrams_3" indexed="true" stored="false" />

  <field name="multigrams_title_stop" type="multigrams_3_7_stop" indexed="true" stored="false" />

  <copyField source="title" dest="multigrams_title" />
  <copyField source="title" dest="multigrams_title_short" />
  <copyField source="title" dest="multigrams_title_stop" />

  <copyField source="body" dest="multigrams_body_short" />
  <copyField source="body" dest="multigrams_body" />

  <uniqueKey>id</uniqueKey>

  <fieldType name="text" class="solr.TextField">
    <analyzer>
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.StopFilterFactory"/>
    </analyzer>
  </fieldType>

  <fieldType name="multigrams_3_7" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.ASCIIFoldingFilterFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.ASCIIFoldingFilterFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/>
    </analyzer>
  </fieldType>

  <fieldType name="multigrams_3" class="solr.TextField" positionIncrementGap="100">
    <!-- only one analyzer -->
    <analyzer>
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.ASCIIFoldingFilterFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/>
    </analyzer>
  </fieldType>

  <fieldType name="multigrams_3_7_stop" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.ASCIIFoldingFilterFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.StopFilterFactory"/>
      <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.ASCIIFoldingFilterFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.StopFilterFactory"/>
      <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/>
    </analyzer>
  </fieldType>

  <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>

</schema>
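For readers unfamiliar with shingles: the multigrams_3_7 type above indexes every 1-, 2-, and 3-word window of a field value, while at query time it produces windows up to 7 words, which is what lets the component look up arbitrary query substrings against indexed phrases. A minimal sketch of what that index analyzer emits, written against the plain Lucene analysis API (this assumes a Lucene 7.x classpath and is not code from the commit):

// Prints each token the multigrams_3_7 index analyzer would emit:
// unigrams plus all 2- and 3-word shingles.
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override protected TokenStreamComponents createComponents(String fieldName) {
        StandardTokenizer source = new StandardTokenizer();
        TokenStream result = new LowerCaseFilter(source);
        ShingleFilter shingles = new ShingleFilter(result, 2, 3); // minShingleSize=2, maxShingleSize=3
        shingles.setOutputUnigrams(true);
        return new TokenStreamComponents(source, shingles);
      }
    };
    try (TokenStream ts = analyzer.tokenStream("body", "the quick brown fox")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term);
        // the / the quick / the quick brown / quick / quick brown /
        // quick brown fox / brown / brown fox / fox
      }
      ts.end();
    }
  }
}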
solrconfig-phrases-identification.xml
@@ -0,0 +1,53 @@
<?xml version="1.0" ?>

<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<config>
  <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
  <schemaFactory class="ClassicIndexSchemaFactory"/>
  <xi:include href="solrconfig.snippet.randomindexconfig.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>

  <searchComponent class="solr.PhrasesIdentificationComponent" name="phrases" />

  <!-- our default search handler should suggest phrases in addition to doing queries if requested -->
  <requestHandler name="/select" class="solr.SearchHandler">
    <arr name="last-components">
      <str>phrases</str>
    </arr>
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <str name="indent">true</str>
      <str name="df">body</str>
      <str name="phrases.fields">multigrams_body multigrams_title^2</str>
    </lst>
  </requestHandler>

  <!-- a custom handler should support exclusively giving phrases w/o doing a query -->
  <requestHandler name="/phrases" class="solr.SearchHandler">
    <arr name="components">
      <str>phrases</str>
    </arr>
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <str name="indent">true</str>
      <bool name="phrases">true</bool>
      <str name="phrases.fields">multigrams_body multigrams_title^2</str>
    </lst>
  </requestHandler>

</config>
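Given the two handlers above, a client can either run a normal search that also reports phrases (/select with phrases=true) or ask for the phrase report alone (/phrases). A minimal SolrJ sketch of the second path; the base URL and collection name ("gettingstarted") are placeholders, not part of the commit:

// Requests the phrase report from a /phrases handler like the one configured above.
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;

public class PhrasesRequestDemo {
  public static void main(String[] args) throws Exception {
    try (SolrClient client =
             new HttpSolrClient.Builder("http://localhost:8983/solr/gettingstarted").build()) {
      ModifiableSolrParams params = new ModifiableSolrParams();
      params.set("q", "did a quick brown fox jump over the lazy dog");

      QueryRequest req = new QueryRequest(params);
      req.setPath("/phrases"); // use "/select" with phrases=true to also run the query
      QueryResponse rsp = req.process(client);

      // the component adds a "phrases" section holding the input, a summary with
      // detected phrases wrapped in {braces}, and per-phrase details
      // (text, offsets, score) -- mirroring what the tests below assert
      NamedList<Object> phrases = (NamedList<Object>) rsp.getResponse().get("phrases");
      System.out.println(phrases.get("summary"));
      // e.g. (depending on the indexed corpus):
      //   did a quick {brown fox} jump over {the lazy dog}
    }
  }
}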
TestCloudPhrasesIdentificationComponent.java
@@ -0,0 +1,200 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.cloud;

import java.lang.invoke.MethodHandles;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;

import org.junit.AfterClass;
import org.junit.BeforeClass;

/**
 * A very simple sanity check that Phrase Identification works across a cloud cluster
 * using distributed term stat collection.
 *
 * @see org.apache.solr.handler.component.PhrasesIdentificationComponentTest
 */
@Slow
public class TestCloudPhrasesIdentificationComponent extends SolrCloudTestCase {

  private static final String DEBUG_LABEL = MethodHandles.lookup().lookupClass().getName();
  private static final String COLLECTION_NAME = DEBUG_LABEL + "_collection";

  /** A basic client for operations at the cloud level, default collection will be set */
  private static CloudSolrClient CLOUD_CLIENT;
  /** One client per node */
  private static ArrayList<HttpSolrClient> CLIENTS = new ArrayList<>(5);

  @BeforeClass
  private static void createMiniSolrCloudCluster() throws Exception {

    // multi replicas should not matter...
    final int repFactor = usually() ? 1 : 2;
    // ... but we definitely want to test multiple shards
    final int numShards = TestUtil.nextInt(random(), 1, (usually() ? 2 : 3));
    final int numNodes = (numShards * repFactor);

    final String configName = DEBUG_LABEL + "_config-set";
    final Path configDir = Paths.get(TEST_HOME(), "collection1", "conf");

    configureCluster(numNodes).addConfig(configName, configDir).configure();

    Map<String, String> collectionProperties = new LinkedHashMap<>();
    collectionProperties.put("config", "solrconfig-phrases-identification.xml");
    collectionProperties.put("schema", "schema-phrases-identification.xml");
    CollectionAdminRequest.createCollection(COLLECTION_NAME, configName, numShards, repFactor)
        .setProperties(collectionProperties)
        .process(cluster.getSolrClient());

    CLOUD_CLIENT = cluster.getSolrClient();
    CLOUD_CLIENT.setDefaultCollection(COLLECTION_NAME);

    waitForRecoveriesToFinish(CLOUD_CLIENT);

    for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
      CLIENTS.add(getHttpSolrClient(jetty.getBaseUrl() + "/" + COLLECTION_NAME + "/"));
    }

    // index some docs...
    CLOUD_CLIENT.add
      (sdoc("id", "42",
            "title","Tale of the Brown Fox: was he lazy?",
            "body", "No. The quick brown fox was a very brown fox who liked to get into trouble."));
    CLOUD_CLIENT.add
      (sdoc("id", "43",
            "title","A fable in two acts",
            "body", "The brOwn fOx jumped. The lazy dog did not"));
    CLOUD_CLIENT.add
      (sdoc("id", "44",
            "title","Why the LazY dog was lazy",
            "body", "News flash: Lazy Dog was not actually lazy, it just seemd so compared to Fox"));
    CLOUD_CLIENT.add
      (sdoc("id", "45",
            "title","Why Are We Lazy?",
            "body", "Because we are. that's why"));
    CLOUD_CLIENT.commit();
  }

  @AfterClass
  private static void afterClass() throws Exception {
    CLOUD_CLIENT.close(); CLOUD_CLIENT = null;
    for (HttpSolrClient client : CLIENTS) {
      client.close();
    }
    CLIENTS = null;
  }

  public void testBasicPhrases() throws Exception {
    final String input = " did a Quick brown FOX perniciously jump over the lazy dog";
    final String expected = " did a Quick {brown FOX} perniciously jump over {the lazy dog}";

    // based on the documents indexed, these assertions should all pass regardless of
    // how many shards we have, or whether the request is done via /phrases or /select...
    for (String path : Arrays.asList("/select", "/phrases")) {
      // ... or if we muck with "q" and use the alternative phrases.q for the bits we care about...
      for (SolrParams p : Arrays.asList(params("q", input, "phrases", "true"),
                                        params("q", "*:*", "phrases.q", input, "phrases", "true"),
                                        params("q", "-*:*", "phrases.q", input, "phrases", "true"))) {
        final QueryRequest req = new QueryRequest(p);
        req.setPath(path);
        final QueryResponse rsp = req.process(getRandClient(random()));
        try {
          NamedList<Object> phrases = (NamedList<Object>) rsp.getResponse().get("phrases");
          assertEquals("input", input, phrases.get("input"));
          assertEquals("summary", expected, phrases.get("summary"));

          final List<NamedList<Object>> details = (List<NamedList<Object>>) phrases.get("details");
          assertNotNull("null details", details);
          assertEquals("num phrases found", 2, details.size());

          final NamedList<Object> lazy_dog = details.get(0);
          assertEquals("dog text", "the lazy dog", lazy_dog.get("text"));
          assertEquals("dog score", 0.166666D, ((Double)lazy_dog.get("score")).doubleValue(), 0.000001D);

          final NamedList<Object> brown_fox = details.get(1);
          assertEquals("fox text", "brown FOX", brown_fox.get("text"));
          assertEquals("fox score", 0.083333D, ((Double)brown_fox.get("score")).doubleValue(), 0.000001D);

        } catch (AssertionError e) {
          throw new AssertionError(e.getMessage() + " ::: " + path + " ==> " + rsp, e);
        }
      }
    }
  }

  public void testEmptyInput() throws Exception {
    // empty input shouldn't error, just produce empty results...
    for (String input : Arrays.asList("", " ")) {
      for (SolrParams p : Arrays.asList(params("q", "*:*", "phrases.q", input, "phrases", "true"),
                                        params("q", "-*:*", "phrases.q", input, "phrases", "true"))) {
        final QueryRequest req = new QueryRequest(p);
        req.setPath("/phrases");
        final QueryResponse rsp = req.process(getRandClient(random()));
        try {
          NamedList<Object> phrases = (NamedList<Object>) rsp.getResponse().get("phrases");
          assertEquals("input", input, phrases.get("input"));
          assertEquals("summary", input, phrases.get("summary"));

          final List<NamedList<Object>> details = (List<NamedList<Object>>) phrases.get("details");
          assertNotNull("null details", details);
          assertEquals("num phrases found", 0, details.size());

        } catch (AssertionError e) {
          throw new AssertionError(e.getMessage() + " ==> " + rsp, e);
        }
      }
    }
  }

  /**
   * returns a random SolrClient -- either a CloudSolrClient, or an HttpSolrClient pointed
   * at a node in our cluster
   */
  public static SolrClient getRandClient(Random rand) {
    int numClients = CLIENTS.size();
    int idx = TestUtil.nextInt(rand, 0, numClients);

    return (idx == numClients) ? CLOUD_CLIENT : CLIENTS.get(idx);
  }

  public static void waitForRecoveriesToFinish(CloudSolrClient client) throws Exception {
    assert null != client.getDefaultCollection();
    AbstractDistribZkTestBase.waitForRecoveriesToFinish(client.getDefaultCollection(),
                                                        client.getZkStateReader(),
                                                        true, true, 330);
  }

}
PhrasesIdentificationComponentTest.java
@@ -0,0 +1,796 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.handler.component.PhrasesIdentificationComponent;
import org.apache.solr.handler.component.PhrasesIdentificationComponent.Phrase;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;

import org.junit.After;
import org.junit.BeforeClass;
import org.junit.Before;

import org.hamcrest.Description;
import org.hamcrest.Matcher;
import org.hamcrest.BaseMatcher;

public class PhrasesIdentificationComponentTest extends SolrTestCaseJ4 {

  private static final String HANDLER = "/phrases";

  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig-phrases-identification.xml", "schema-phrases-identification.xml");
  }

  @Before
  public void addSomeDocs() throws Exception {
    assertU(adoc("id", "42",
                 "title","Tale of the Brown Fox: was he lazy?",
                 "body", "No. The quick brown fox was a very brown fox who liked to get into trouble."));
    assertU(adoc("id", "43",
                 "title","A fable in two acts",
                 "body", "The brOwn fOx jumped. The lazy dog did not"));
    assertU(adoc("id", "44",
                 "title","Why the LazY dog was lazy",
                 "body", "News flash: Lazy Dog was not actually lazy, it just seemd so compared to Fox"));
    assertU(adoc("id", "45",
                 "title","Why Are We Lazy?",
                 "body", "Because we are. that's why"));
    assertU(commit());
  }

  @After
  public void deleteAllDocs() throws Exception {
    assertU(delQ("*:*"));
    assertU(commit());
  }

  public void testWhiteBoxPhraseParsingLongInput() throws Exception {
    final SchemaField field = h.getCore().getLatestSchema().getField("multigrams_body");
    assertNotNull(field);
    final List<Phrase> phrases = Phrase.extractPhrases
      (" did a Quick brown FOX perniciously jump over the lAZy dog", field, 3, 7);

    assertEquals(IntStream.rangeClosed((11-7+1), 11).sum(), // 11 words, max query phrase size is 7
                 phrases.size());

    // spot check a few explicitly chosen phrases of various lengths...

    { // single term, close to the edge, so not as many super phrases as other terms might have
      final Phrase lazy = phrases.get(phrases.size() - 1 - 2);
      final String debug = lazy.toString();

      assertEquals(debug, "lAZy", lazy.getSubSequence());
      assertEquals(debug, 10, lazy.getPositionStart());
      assertEquals(debug, 11, lazy.getPositionEnd());
      assertEquals(debug, 1, lazy.getPositionLength());

      assertEquals(debug, 54, lazy.getOffsetStart());
      assertEquals(debug, 58, lazy.getOffsetEnd());

      assertEquals(debug, 1, lazy.getIndividualIndexedTerms().size());
      assertEquals(debug, 1, lazy.getLargestIndexedSubPhrases().size());
      assertEquals(debug, lazy, lazy.getIndividualIndexedTerms().get(0));
      assertEquals(debug, lazy, lazy.getLargestIndexedSubPhrases().get(0));
      assertEquals(debug, 4, lazy.getIndexedSuperPhrases().size()); // (2 each: len=2, len=3)
    }
    { // length 2, middle of the pack
      final Phrase brown_fox = phrases.get((7 * 3) + 1);
      final String debug = brown_fox.toString();

      assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
      assertEquals(debug, 4, brown_fox.getPositionStart());
      assertEquals(debug, 6, brown_fox.getPositionEnd());
      assertEquals(debug, 2, brown_fox.getPositionLength());

      assertEquals(debug, 17, brown_fox.getOffsetStart());
      assertEquals(debug, 26, brown_fox.getOffsetEnd());

      assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
      assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
      assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
      assertEquals(debug, 2, brown_fox.getIndexedSuperPhrases().size()); // (2 @ len=3)
    }
    { // length 3 (which is the max indexed size) @ start of the string
      final Phrase daq = phrases.get(2);
      final String debug = daq.toString();

      assertEquals(debug, "did a Quick", daq.getSubSequence());
      assertEquals(debug, 1, daq.getPositionStart());
      assertEquals(debug, 4, daq.getPositionEnd());
      assertEquals(debug, 3, daq.getPositionLength());

      assertEquals(debug, 1, daq.getOffsetStart());
      assertEquals(debug, 13, daq.getOffsetEnd());

      assertEquals(debug, 3, daq.getIndividualIndexedTerms().size());
      assertEquals(debug, 1, daq.getLargestIndexedSubPhrases().size());
      assertEquals(debug, daq, daq.getLargestIndexedSubPhrases().get(0));
      assertEquals(debug, 0, daq.getIndexedSuperPhrases().size());
    }
    { // length 4 phrase (larger than the max indexed size)
      final Phrase qbfp = phrases.get((7 * 2) + 3);
      final String debug = qbfp.toString();

      assertEquals(debug, "Quick brown FOX perniciously", qbfp.getSubSequence());
      assertEquals(debug, 3, qbfp.getPositionStart());
      assertEquals(debug, 7, qbfp.getPositionEnd());
      assertEquals(debug, 4, qbfp.getPositionLength());

      assertEquals(debug, 8, qbfp.getOffsetStart());
      assertEquals(debug, 39, qbfp.getOffsetEnd());

      assertEquals(debug, 4, qbfp.getIndividualIndexedTerms().size());
      assertEquals(debug, 2, qbfp.getLargestIndexedSubPhrases().size());
      assertEquals(debug, 0, qbfp.getIndexedSuperPhrases().size());
    }

    // some blanket assumptions about the results...
    assertBasicSanityChecks(phrases, 11, 3, 7);
  }

  public void testWhiteBoxPhraseParsingShortInput() throws Exception {
    // for input this short, either of these fields should be (mostly) equivalent
    final Map<String,Integer> fields = new TreeMap<>();
    fields.put("multigrams_body", 7);
    fields.put("multigrams_body_short", 3);
    for (Map.Entry<String,Integer> entry : fields.entrySet()) {
      try {
        final int maxQ = entry.getValue();
        final SchemaField field = h.getCore().getLatestSchema().getField(entry.getKey());
        assertNotNull(field);

        // empty input shouldn't break anything
        assertEquals(0, Phrase.extractPhrases(random().nextBoolean() ? "" : " ", field, 3, maxQ).size());

        // input shorter than our index/query phrase sizes shouldn't break anything either...
        final List<Phrase> phrases = Phrase.extractPhrases("brown FOX", field, 3, maxQ);

        assertEquals(3, phrases.size());

        { // length 2
          final Phrase brown_fox = phrases.get(1);
          final String debug = brown_fox.toString();

          assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
          assertEquals(debug, 1, brown_fox.getPositionStart());
          assertEquals(debug, 3, brown_fox.getPositionEnd());
          assertEquals(debug, 2, brown_fox.getPositionLength());

          assertEquals(debug, 0, brown_fox.getOffsetStart());
          assertEquals(debug, 9, brown_fox.getOffsetEnd());

          assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
          assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
          assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
          assertEquals(debug, 0, brown_fox.getIndexedSuperPhrases().size());
        }
        { // length 1
          final Phrase fox = phrases.get(2);
          final String debug = fox.toString();

          assertEquals(debug, "FOX", fox.getSubSequence());
          assertEquals(debug, 2, fox.getPositionStart());
          assertEquals(debug, 3, fox.getPositionEnd());
          assertEquals(debug, 1, fox.getPositionLength());

          assertEquals(debug, 6, fox.getOffsetStart());
          assertEquals(debug, 9, fox.getOffsetEnd());

          assertEquals(debug, 1, fox.getIndividualIndexedTerms().size());
          assertEquals(debug, 1, fox.getLargestIndexedSubPhrases().size());
          assertEquals(debug, fox, fox.getLargestIndexedSubPhrases().get(0));
          assertEquals(debug, 1, fox.getIndexedSuperPhrases().size());
        }

        assertBasicSanityChecks(phrases, 2, 3, maxQ);
      } catch (AssertionError e) {
        throw new AssertionError(entry.getKey() + " => " + e.getMessage(), e);
      }
    }
  }

  /**
   * Asserts some basic rules that should be enforced about all Phrases
   * &amp; their linkages to other phrases
   */
  private void assertBasicSanityChecks(final List<Phrase> phrases,
                                       final int inputPositionLength,
                                       final int maxIndexedPositionLength,
                                       final int maxQueryPositionLength) throws Exception {
    assert 0 < phrases.size() : "Don't use this method if phrases might be empty";

    assertEmptyStream("no phrase should be longer than "+maxQueryPositionLength+" positions",
                      phrases.stream().filter(p -> p.getPositionLength() > maxQueryPositionLength));

    assertEmptyStream("no phrase should have a start offset < 0",
                      phrases.stream().filter(p -> p.getOffsetStart() < 0));
    assertEmptyStream("no phrase should have a start position < 1",
                      phrases.stream().filter(p -> p.getPositionStart() < 1));

    assertEmptyStream("If a phrase has a start offset of 0, then it must have position 1",
                      phrases.stream().filter(p -> (p.getOffsetStart() == 0)
                                              && (p.getPositionStart() != 1)));

    final Phrase first = phrases.get(0);
    final Phrase last = phrases.get(phrases.size()-1);

    assertEmptyStream("no phrase should have a start offset < first phrase",
                      phrases.stream().filter(p -> p.getOffsetStart() < first.getOffsetStart()));
    assertEmptyStream("no phrase should have an end offset > last phrase",
                      phrases.stream().filter(p -> last.getOffsetEnd() < p.getOffsetEnd()));

    assertEmptyStream("no phrase should have a start position < first phrase",
                      phrases.stream().filter(p -> p.getPositionStart() < first.getPositionStart()));
    assertEmptyStream("no phrase should have an end position > last phrase",
                      phrases.stream().filter(p -> last.getPositionEnd() < p.getPositionEnd()));

    // NOTE: stuff below this point may not be true for all analyzers (ie: stopwords)
    // but should be valid for the analyzers used in this test...
    // (if we expand test to cover analyzers w/stopwords, refactor this into a new method)

    for (int n = 1; n <= maxQueryPositionLength; n++) {
      final int len = n;
      final int expected = Math.max(0, 1 + inputPositionLength - n);
      final List<Phrase> sizeN = phrases.stream().filter(p -> p.getPositionLength() == len
                                                         ).collect(Collectors.toList());
      assertEquals("Expected # phrases of size " + n + ": " + sizeN, expected, sizeN.size());
    }

    // check the quantities of sub-terms/phrases...
    assertEmptyStream("no phrase should have num indexed terms != pos_len",
                      phrases.stream().filter
                      (p -> p.getPositionLength() != p.getIndividualIndexedTerms().size()));
    assertEmptyStream("no phrase should have num sub-phrases != max(1, 1 + pos_len - "+maxIndexedPositionLength+")",
                      phrases.stream().filter
                      (p -> (Math.max(1, 1 + p.getPositionLength() - maxIndexedPositionLength)
                             != p.getLargestIndexedSubPhrases().size())));
    // NOTE: indexed super phrases can be of various lengths, and differing quantities near the
    // beginning/end of input, so don't worry about an exact count; just check their properties (below)

    // check the properties of our sub/super phrases
    for (Phrase phrase : phrases) {
      final String debug = phrase.toString();

      assertEmptyStream(debug + " should not have any indexed terms where pos_len != 1",
                        phrase.getIndividualIndexedTerms().stream().filter
                        (term -> 1 != term.getPositionLength()));

      assertEmptyStream(debug + " should not have any sub-phrases where pos_len > min(pos_len, "
                        + maxIndexedPositionLength + ")",
                        phrase.getLargestIndexedSubPhrases().stream().filter
                        (inner -> (Math.min(phrase.getPositionLength(), maxIndexedPositionLength)
                                   < inner.getPositionLength())));

      assertEmptyStream(debug + " should not have any super-phrases where super.len <= phrase.len or "
                        + maxIndexedPositionLength + " < super.len",
                        phrase.getIndexedSuperPhrases().stream().filter
                        (outer -> (outer.getPositionLength() <= phrase.getPositionLength() ||
                                   maxIndexedPositionLength < outer.getPositionLength())));
    }
  }

  public void testWhiteboxStats() throws Exception {
    final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
    assertNotNull(analysisField);
    final String input = "BROWN fox lAzY dog xxxyyyzzz";

    // a function we'll re-use on phrases generated from the above input;
    // the multiplier lets us simulate multiple shards returning the same values
    BiConsumer<Integer,List<Phrase>> assertions = (mult, phrases) -> {
      final Phrase brown_fox = phrases.get(1);
      assertEquals("BROWN fox", brown_fox.getSubSequence());

      assertEquals(mult * 1, brown_fox.getTTF("multigrams_title"));
      assertEquals(mult * 1, brown_fox.getDocFreq("multigrams_title"));
      assertEquals(mult * 1, brown_fox.getConjunctionDocCount("multigrams_title"));

      assertEquals(mult * 3, brown_fox.getTTF("multigrams_body"));
      assertEquals(mult * 2, brown_fox.getDocFreq("multigrams_body"));
      assertEquals(mult * 2, brown_fox.getConjunctionDocCount("multigrams_body"));

      final Phrase fox_lazy = phrases.get(6);
      assertEquals("fox lAzY", fox_lazy.getSubSequence());

      assertEquals(mult * 0, fox_lazy.getTTF("multigrams_title"));
      assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_title"));
      assertEquals(mult * 1, fox_lazy.getConjunctionDocCount("multigrams_title"));

      assertEquals(mult * 0, fox_lazy.getTTF("multigrams_body"));
      assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_body"));
      assertEquals(mult * 2, fox_lazy.getConjunctionDocCount("multigrams_body"));

      final Phrase bfld = phrases.get(3);
      assertEquals("BROWN fox lAzY dog", bfld.getSubSequence());

      expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_title"); });
      expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_title"); });
      assertEquals(mult * 0, bfld.getConjunctionDocCount("multigrams_title"));

      expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_body"); });
      expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_body"); });
      assertEquals(mult * 1, bfld.getConjunctionDocCount("multigrams_body"));

      final Phrase xyz = phrases.get(phrases.size()-1);

      assertEquals("xxxyyyzzz", xyz.getSubSequence());
      assertEquals(mult * 0, xyz.getTTF("multigrams_title"));
      assertEquals(mult * 0, xyz.getDocFreq("multigrams_title"));
      assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_title"));

      assertEquals(mult * 0, xyz.getTTF("multigrams_body"));
      assertEquals(mult * 0, xyz.getDocFreq("multigrams_body"));
      assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_body"));
    };

    final List<Phrase> phrasesLocal = Phrase.extractPhrases(input, analysisField, 3, 7);

    // freshly parsed phrases, w/o any stats populated, all the stats should be 0
    assertions.accept(0, phrasesLocal);

    // If we populate with our index stats, we should get the basic values in our BiConsumer
    try (SolrQueryRequest req = req()) {
      Phrase.populateStats(phrasesLocal, Arrays.asList("multigrams_body","multigrams_title"),
                           req.getSearcher());
    }
    assertions.accept(1, phrasesLocal);

    // likewise, if we create a new freshly parsed set of phrases, and "merge" in the previous index stats
    // (ie: merge results from one shard) we should get the same results
    final List<Phrase> phrasesMerged = Phrase.extractPhrases(input, analysisField, 3, 7);
    Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal));
    assertions.accept(1, phrasesMerged);

    // if we merge in a second copy of the same results (ie: two identical shards)
    // our results should be double what we had before
    Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal));
    assertions.accept(2, phrasesMerged);
  }

  public void testWhiteboxScores() throws Exception {
    final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
    assertNotNull(analysisField);
    final Map<String,Double> fieldWeights = new TreeMap<>();
    fieldWeights.put("multigrams_title", 1.0D);
    fieldWeights.put("multigrams_body", 0.0D); // NOTE: 0 weighting should only affect total score

    final String input = "xxxyyyzzz BROWN fox why are we lAzY";
    final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
    try (SolrQueryRequest req = req()) {
      Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
    }
    Phrase.populateScores(phrases, fieldWeights, 3, 7);

    // do some basic sanity checks of the field & total scores...

    for (Phrase xyz : phrases.subList(0, 7)) {
      // first 7 all start with xyz, which isn't in the index (in either field), so all scores should be -1
      assertEquals(xyz.toString(), -1.0D, xyz.getTotalScore(), 0.0D);
      assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_title"), 0.0D);
      assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_body"), 0.0D);
    }

    // any individual terms (past xyz) should score 0.0 because they are all actually in the index
    // (in both fields)
    for (Phrase term : phrases.subList(7, phrases.size()).stream().filter
           ((p -> 1 == p.getPositionLength())).collect(Collectors.toList())) {

      assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_title"), 0.0D);
      assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_body"), 0.0D);
      assertEquals(term.toString(), 0.0D, term.getTotalScore(), 0.0D);
    }

    // "brown fox" should score positively in both fields, and overall...
    final Phrase brown_fox = phrases.get(8);
    assertEquals("BROWN fox", brown_fox.getSubSequence());
    assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
    assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_body"), greaterThan(0.0D));
    assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));

    // "we lazy" does appear in a title value, but should score poorly given how often the terms
    // are used in other contexts, and should score -1 against body -- but because of our weights,
    // that shouldn't bring down the total
    final Phrase we_lazy = phrases.get(phrases.size()-2);
    assertEquals("we lAzY", we_lazy.getSubSequence());
    assertEquals(we_lazy.toString(), -1.0D, we_lazy.getFieldScore("multigrams_body"), 0.0D);
    assertThat(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), lessThan(0.0D));
    assertThat(we_lazy.toString(), we_lazy.getTotalScore(), lessThan(0.0D));
    assertEquals(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), we_lazy.getTotalScore(),
                 0.0D);

    // "why are we lazy" is longer than the max indexed phrase size & appears verbatim in a title value;
    // it should score -1 against body -- but because of our weights, that shouldn't bring down the total
    final Phrase wawl = phrases.get(phrases.size()-7);
    assertEquals("why are we lAzY", wawl.getSubSequence());
    assertEquals(wawl.toString(), -1.0D, wawl.getFieldScore("multigrams_body"), 0.0D);
    assertThat(wawl.toString(), wawl.getFieldScore("multigrams_title"), greaterThan(0.0D));
    assertThat(wawl.toString(), wawl.getTotalScore(), greaterThan(0.0D));
    assertEquals(wawl.toString(), wawl.getFieldScore("multigrams_title"), wawl.getTotalScore(),
                 0.0D);

    // "brown fox why are we" is longer than the max indexed phrase, and none of its
    // (longest) sub-phrases exist in either field -- so all of its scores should be -1
    final Phrase bfwaw = phrases.get(11);
    assertEquals("BROWN fox why are we", bfwaw.getSubSequence());
    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_title"), 0.0D);
    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_body"), 0.0D);
    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getTotalScore(), 0.0D);
  }

  public void testWhiteboxScoresStopwords() throws Exception {
    final String input = "why the lazy dog brown fox";
    final Map<String,Double> fieldWeights = new TreeMap<>();
    fieldWeights.put("multigrams_title", 1.0D);
    fieldWeights.put("multigrams_title_stop", 1.0D);

    { // If our analysisField uses all terms,
      // but we also generate scores from a field that filters stopwords...
      final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title");
      assertNotNull(analysisField);

      final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
      try (SolrQueryRequest req = req()) {
        Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
      }
      Phrase.populateScores(phrases, fieldWeights, 3, 7);

      // phrases that span the stop word should have valid scores from the field that doesn't care
      // about stop words, but the stopword field should reject them
      final Phrase why_the_lazy = phrases.get(2);
      assertEquals("why the lazy", why_the_lazy.getSubSequence());
      assertThat(why_the_lazy.toString(), why_the_lazy.getFieldScore("multigrams_title"), greaterThan(0.0D));
      assertEquals(why_the_lazy.toString(), -1.0D, why_the_lazy.getFieldScore("multigrams_title_stop"), 0.0D);

      final Phrase the_lazy_dog = phrases.get(8);
      assertEquals("the lazy dog", the_lazy_dog.getSubSequence());
      assertThat(the_lazy_dog.toString(), the_lazy_dog.getFieldScore("multigrams_title"), greaterThan(0.0D));
      assertEquals(the_lazy_dog.toString(), -1.0D, the_lazy_dog.getFieldScore("multigrams_title_stop"), 0.0D);

      // sanity check that good scores are still possible with stopwords:
      // "brown fox" should score positively in both fields, and overall...
      final Phrase brown_fox = phrases.get(phrases.size()-2);
      assertEquals("brown fox", brown_fox.getSubSequence());
      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D));
      assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
    }

    { // now flip things: our analysisField filters stopwords,
      // but we also generate scores from a field that doesn't know about them...
      //
      // (NOTE: the parser will still generate _some_ candidate phrases spanning the stop word position,
      // but not ones that start with the stopword)
      final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title_stop");
      assertNotNull(analysisField);

      final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
      try (SolrQueryRequest req = req()) {
        Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
      }
      Phrase.populateScores(phrases, fieldWeights, 3, 7);
      assertTrue(phrases.toString(), 0 < phrases.size());

      for (Phrase p : phrases) {
        if (p.getPositionStart() <= 2 && 2 < p.getPositionEnd()) {
          // phrases that span the stop word position should be rejected (-1) by both fields
          assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title"), 0.0D);
          assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title_stop"), 0.0D);
        }
      }

      // sanity check that good scores are still possible with stopwords:
      // "brown fox" should score positively in both fields, and overall...
      final Phrase brown_fox = phrases.get(phrases.size()-2);
      assertEquals("brown fox", brown_fox.getSubSequence());
      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D));
      assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
    }
  }

  public void testExpectedUserErrors() throws Exception {
    assertQEx("empty field list should error",
              "must specify a (weighted) list of fields",
              req("q","foo", "phrases","true",
                  "phrases.fields", " "),
              ErrorCode.BAD_REQUEST);

    assertQEx("bogus field name should error",
              "does not exist",
              req("q","foo", "phrases","true",
                  "phrases.fields", "bogus1 bogus2"),
              ErrorCode.BAD_REQUEST);

    assertQEx("lack of shingles should cause error",
              "Unable to determine max position length",
              req("q","foo", "phrases","true",
                  "phrases.fields", "title"),
              ErrorCode.BAD_REQUEST);

    assertQEx("analyzer mismatch should cause error",
              "must have the same fieldType",
              req("q","foo", "phrases","true",
                  "phrases.fields", "multigrams_title multigrams_title_short"),
              ErrorCode.BAD_REQUEST);

    assertQEx("analysis field must exist",
              "does not exist",
              req("q","foo", "phrases","true",
                  "phrases.analysis.field", "bogus",
                  "phrases.fields", "multigrams_title multigrams_title_short"),
              ErrorCode.BAD_REQUEST);

    assertQEx("no query param should error",
              "requires a query string",
              req("qt", "/phrases",
                  "phrases.fields", "multigrams_title"),
              ErrorCode.BAD_REQUEST);
  }

  public void testMaxShingleSizeHelper() throws Exception {
    IndexSchema schema = h.getCore().getLatestSchema();

    assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
                 (schema.getFieldTypeByName("multigrams_3_7").getIndexAnalyzer()));
    assertEquals(7, PhrasesIdentificationComponent.getMaxShingleSize
                 (schema.getFieldTypeByName("multigrams_3_7").getQueryAnalyzer()));

    assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
                 (schema.getFieldTypeByName("multigrams_3").getIndexAnalyzer()));
    assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
                 (schema.getFieldTypeByName("multigrams_3").getQueryAnalyzer()));

    assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize
                 (schema.getFieldTypeByName("text").getIndexAnalyzer()));
    assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize
                 (schema.getFieldTypeByName("text").getQueryAnalyzer()));
  }

  public void testSimplePhraseRequest() throws Exception {
    final String input = " did a Quick brown FOX perniciously jump over the lazy dog";
    final String expected = " did a Quick {brown FOX} perniciously jump over {the lazy dog}";

    // should get same behavior regardless of whether we use "q" or "phrases.q"
    for (String p : Arrays.asList("q", "phrases.q")) {
      // basic request...
      assertQ(req("qt", HANDLER, p, input)
              // expect no search results...
              , "count(//result)=0"

              // just phrase info...
              , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
              , "//lst[@name='phrases']/str[@name='summary'][.='"+expected+"']"
              , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 2"
              //
              , "//lst[@name='phrases']/arr[@name='details']/lst[1]/str[@name='text'][.='the lazy dog']"
              , "//lst[@name='phrases']/arr[@name='details']/lst[1]/int[@name='offset_start'][.='50']"
              , "//lst[@name='phrases']/arr[@name='details']/lst[1]/int[@name='offset_end'][.='62']"
              , "//lst[@name='phrases']/arr[@name='details']/lst[1]/double[@name='score'][number(.) > 0]"
              //
              , "//lst[@name='phrases']/arr[@name='details']/lst[2]/str[@name='text'][.='brown FOX']"
              , "//lst[@name='phrases']/arr[@name='details']/lst[2]/int[@name='offset_start'][.='17']"
              , "//lst[@name='phrases']/arr[@name='details']/lst[2]/int[@name='offset_end'][.='26']"
              , "//lst[@name='phrases']/arr[@name='details']/lst[2]/double[@name='score'][number(.) > 0]"
              );

      // empty input, empty phrases (and no error)...
      assertQ(req("qt", HANDLER, p, "")
              // expect no search results...
              , "count(//result)=0"
              // just empty phrase info for our empty input...
              , "//lst[@name='phrases']/str[@name='input'][.='']"
              , "//lst[@name='phrases']/str[@name='summary'][.='']"
              , "count(//lst[@name='phrases']/arr[@name='details']) = 1"
              , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 0"
              );
    }
  }

  public void testSimpleSearchRequests() throws Exception {
    final String input = "\"brown fox\"";

    assertQ(req("q", input)
            // basic search should have worked...
            , "//result[@numFound='2']"
            , "//result/doc/str[@name='id'][.='42']"
            , "//result/doc/str[@name='id'][.='43']"
            // and phrases should not be returned since they weren't requested...
            , "0=count(//lst[@name='phrases'])"
            );

    assertQ(req("phrases", "false", "q", input)
            // basic search should have worked...
            , "//result[@numFound='2']"
            , "//result/doc/str[@name='id'][.='42']"
            , "//result/doc/str[@name='id'][.='43']"
            // and phrases should not be returned since they were explicitly disabled...
            , "0=count(//lst[@name='phrases'])"
            );

    // with input this short, all of these permutations of requests should produce the same output...
    for (SolrQueryRequest req : Arrays.asList
           ( // simple, using 3/7 defaults
             req("phrases","true", "q", input),

             // simple, using just the 3/3 'short' fields
             req("phrases","true", "q", input,
                 "phrases.fields", "multigrams_body_short multigrams_title_short^2"),

             // diff analyzers, but explicit override using 3/3 "short" field...
             req("phrases","true", "q", input,
                 "phrases.fields", "multigrams_body multigrams_title_short^2",
                 "phrases.analysis.field", "multigrams_title_short"))) {
      assertQ(req
              // basic search should have worked...
              , "//result[@numFound='2']"
              , "//result/doc/str[@name='id'][.='42']"
              , "//result/doc/str[@name='id'][.='43']"

              // and we should have gotten phrase info...
              , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
              , "//lst[@name='phrases']/str[@name='summary'][.='\"{brown fox}\"']"
              , "count(//lst[@name='phrases']/arr[@name='details']/lst)=1"
              , "//lst[@name='phrases']/arr[@name='details']/lst/str[@name='text'][.='brown fox']"
              , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_start'][.='1']"
              , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_end'][.='10']"
              , "//lst[@name='phrases']/arr[@name='details']/lst/double[@name='score'][number(.) > 0]"
              );
    }

    // override the query string to get different phrases
    assertQ(req("phrases","true", "q", "*:*", "phrases.q", input)
            // basic search should have found all docs...
            , "//result[@numFound='4']"
            // and we should have gotten phrase info for our alternative q string...
            , "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
            , "//lst[@name='phrases']/str[@name='summary'][.='\"{brown fox}\"']"
            , "count(//lst[@name='phrases']/arr[@name='details']/lst)=1"
            , "//lst[@name='phrases']/arr[@name='details']/lst/str[@name='text'][.='brown fox']"
            , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_start'][.='1']"
            , "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_end'][.='10']"
            , "//lst[@name='phrases']/arr[@name='details']/lst/double[@name='score'][number(.) > 0]"
            );

    // empty input, empty phrases (but no error)
    assertQ(req("phrases","true", "q", "*:*", "phrases.q", "")
            // basic search should have found all docs...
            , "//result[@numFound='4']"
            // and we should have gotten (empty) phrase info for our alternative q string...
            , "//lst[@name='phrases']/str[@name='input'][.='']"
            , "//lst[@name='phrases']/str[@name='summary'][.='']"
            , "count(//lst[@name='phrases']/arr[@name='details']) = 1"
            , "count(//lst[@name='phrases']/arr[@name='details']/lst) = 0"
            );
  }

  public void testGreyboxShardSearchRequests() throws Exception {
    final String input = "quick brown fox ran";

    final String phrase_xpath = "//lst[@name='phrases']";
    final String all_phrase_xpath = phrase_xpath + "/arr[@name='_all']";

    // phrases requested, and correct request stage / shard purpose ...
    assertQ(req("q", input,
                "phrases","true",
                ShardParams.IS_SHARD, "true",
                ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)

            // this shard request should have caused stats to be returned about all phrases...
            , "10=count("+ all_phrase_xpath +"/lst)"
            // "quick" ...
            , all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_body'][.='1']"
            , all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_title'][.='0']"
            // ...
            // "brown fox"
            , all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_body'][.='3']"
            , all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_title'][.='1']"
            , all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_body'][.='2']"
            , all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_title'][.='1']"
            , all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_body'][.='2']"
            , all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_title'][.='1']"

            // but no computed "scores"...
            , "0=count("+phrase_xpath+"//*[@name='score'])"
            );

    // phrases requested, but incorrect request stage / shard purpose ...
    assertQ(req("q", input,
                "phrases","true",
                ShardParams.IS_SHARD, "true",
                ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
            , "0=count("+ phrase_xpath +"/lst)");

    // phrases disabled, regardless of request stage / shard purpose ...
    assertTrue("sanity check failed, stage was modified in code w/o updating test",
               PhrasesIdentificationComponent.SHARD_PURPOSE != ShardRequest.PURPOSE_GET_FIELDS);
    assertQ(req("q", input,
                "phrases","false",
                ShardParams.IS_SHARD, "true",
                ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
            , "0=count("+ phrase_xpath +"/lst)");
    assertQ(req("q", input,
                "phrases","false",
                ShardParams.IS_SHARD, "true",
                ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)
            , "0=count("+ phrase_xpath +"/lst)");
  }

  // ////////////////////////////////////////////////////////////////

  /**
   * Trivial helper method that collects &amp; compares to an empty List so
   * the assertion shows the unexpected stream elements
   */
  public <T> void assertEmptyStream(final String msg, final Stream<? extends T> stream) {
    assertEquals(msg,
                 Collections.emptyList(),
                 stream.collect(Collectors.toList()));
  }

  /** helper; future junit/hamcrest versions seem to have something similar */
  public static Matcher<Double> lessThan(double expected) {
    return new BaseMatcher<Double>() {
      @Override public boolean matches(Object actual) {
        return ((Double)actual).compareTo(expected) < 0;
      }
      @Override public void describeTo(Description d) {
        d.appendText("should be less than " + expected);
      }
    };
  }

  /** helper; future junit/hamcrest versions seem to have something similar */
  public static Matcher<Double> greaterThan(double expected) {
    return new BaseMatcher<Double>() {
      @Override public boolean matches(Object actual) {
        return 0 < ((Double)actual).compareTo(expected);
      }
      @Override public void describeTo(Description d) {
        d.appendText("should be greater than " + expected);
      }
    };
  }
}