SOLR-2015: add boolean attribute autoGeneratePhraseQueries to TextField

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@979049 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2010-07-25 15:13:05 +00:00
parent a5430edf83
commit da7655c72a
6 changed files with 116 additions and 7 deletions

View File

@ -203,6 +203,15 @@ New Features
http://wiki.apache.org/solr/SpatialSearch and the example. Refactored some items in Lucene spatial.
Removed SpatialTileField as the underlying CartesianTier is broken beyond repair and is going to be moved. (gsingers)
* SOLR-2015: Add a boolean attribute autoGeneratePhraseQueries to TextField.
autoGeneratePhraseQueries="true" (the default) causes the query parser to
generate phrase queries if multiple tokens are generated from a single
non-quoted analysis string. For example WordDelimiterFilter splitting text:pdp-11
will cause the parser to generate text:"pdp 11" rather than (text:PDP OR text:11).
Note that autoGeneratePhraseQueries="true" tends not to work well for
non-whitespace-delimited languages. (yonik)
Optimizations
----------------------

View File

@ -213,8 +213,12 @@
words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled.
The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to
form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
to generate text:"pdp 11" rather than (text:PDP OR text:11).
NOTE: autoGeneratePhraseQueries="true" tends not to work well for non-whitespace-delimited languages.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time

View File

@ -46,13 +46,21 @@ import java.io.StringReader;
* @version $Id$
*/
public class TextField extends FieldType {
// Value of the "autoGeneratePhraseQueries" schema attribute; stays true unless init() finds
// that attribute in the field type's args and parses it to a different value.
protected boolean autoGeneratePhraseQueries = true;
/**
 * Configures this field type from its schema arguments.
 *
 * <p>Always marks the type as TOKENIZED. For schema versions greater than 1.1 the
 * OMIT_TF_POSITIONS bit is cleared. The optional "autoGeneratePhraseQueries" argument is
 * removed from {@code args} and, when present, parsed with {@link Boolean#parseBoolean};
 * the remaining arguments are then handed to the superclass.
 *
 * @param schema the schema this field type belongs to
 * @param args   the attribute map for this field type; the "autoGeneratePhraseQueries"
 *               entry is removed from it before delegating to the superclass
 */
protected void init(IndexSchema schema, Map<String,String> args) {
  properties |= TOKENIZED;
  if (schema.getVersion() > 1.1f) {
    properties &= ~OMIT_TF_POSITIONS;
  }

  // Pull our own attribute out of the map before the superclass sees the remaining args.
  final String phraseSetting = args.remove("autoGeneratePhraseQueries");
  if (phraseSetting != null) {
    autoGeneratePhraseQueries = Boolean.parseBoolean(phraseSetting);
  }

  super.init(schema, args);
}
/**
 * Returns the autoGeneratePhraseQueries setting of this field type.
 *
 * @return the current value of the {@code autoGeneratePhraseQueries} field
 */
public boolean getAutoGeneratePhraseQueries() {
  return this.autoGeneratePhraseQueries;
}
/**
 * Produces the sort field for the given schema field by delegating to
 * {@code getStringSort}.
 *
 * @param field   the schema field to sort on
 * @param reverse passed through unchanged to {@code getStringSort}
 * @return whatever {@code getStringSort(field, reverse)} returns
 */
public SortField getSortField(SchemaField field, boolean reverse) {
  final SortField stringSort = getStringSort(field, reverse);
  return stringSort;
}

View File

@ -142,11 +142,15 @@ public class SolrQueryParser extends QueryParser {
return parser.subQuery(queryText, null).getQuery();
}
}
//Intercept poly fields, as they get expanded by default to an OR clause of
SchemaField sf = schema.getField(field);
//TODO: is there anyway to avoid this instance of check?
if (sf != null&& !(sf.getType() instanceof TextField)){//we have a poly field, deal with it specially by delegating to the FieldType
return sf.getType().getFieldQuery(parser, sf, queryText);
SchemaField sf = schema.getFieldOrNull(field);
if (sf != null) {
FieldType ft = sf.getType();
// delegate to type for everything except TextField
if (ft instanceof TextField) {
return super.getFieldQuery(field, queryText, quoted || ((TextField)ft).getAutoGeneratePhraseQueries());
} else {
return sf.getType().getFieldQuery(parser, sf, queryText);
}
}
// default to a normal field query

View File

@ -0,0 +1,53 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.After;
import org.junit.Test;
/**
 * Checks how unquoted, multi-token query text is turned into a query for the
 * "text" and "text_np" fields of the test schema.
 */
public class TestSolrQueryParser extends SolrTestCaseJ4 {

  /** Starts the test core and loads the two sample documents once for the whole class. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig.xml", "schema12.xml");
    createIndex();
  }

  /**
   * Indexes two documents, each storing the same value in both the "text" and
   * "text_np" fields, and commits.
   */
  public static void createIndex() {
    final String fourWords = "how now brown cow";
    final String twoWords = "now cow";

    assertU(adoc("id","1", "text",fourWords, "text_np",fourWords));
    assertU(adoc("id","2", "text",twoWords, "text_np",twoWords));
    assertU(commit());
  }

  /** Runs the same hyphenated query text against both fields and compares hit counts. */
  @Test
  public void testPhrase() {
    // should generate a phrase of "now cow" and match only one doc
    assertNumFound("text:now-cow", 1);

    // should generate a query of (now OR cow) and match both docs
    assertNumFound("text_np:now-cow", 2);
  }

  /** Issues the query with indent=true and asserts the reported numFound value. */
  private void assertNumFound(String query, int expectedHits) {
    assertQ(req("q",query, "indent","true")
            ,"//*[@numFound='" + expectedHits + "']"
    );
  }
}

View File

@ -145,6 +145,35 @@
</fieldType>
<!-- field type that doesn't generate phrases from unquoted multiple tokens per analysis unit -->
<fieldType name="text_np" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false" >
<!-- index-time chain: whitespace tokenizer, stop filter, word delimiter with catenateWords and
     catenateNumbers enabled, lowercase, keyword marker (protwords.txt), Porter stemmer -->
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<!-- query-time chain: same filters as index time, plus a synonym filter (synonyms.txt) right
     after the tokenizer; the word delimiter here has catenateWords and catenateNumbers disabled -->
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldtype name="nametext" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
@ -403,6 +432,8 @@
<field name="weight" type="float" indexed="true" stored="true"/>
<field name="bday" type="date" indexed="true" stored="true"/>
<field name="text_np" type="text_np" indexed="true" stored="false"/>
<field name="title_stemmed" type="text" indexed="true" stored="false"/>
<field name="title_lettertok" type="lettertok" indexed="true" stored="false"/>