Fixes for SOLR-2921 (making more components MultiTermAware)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303939 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erick Erickson 2012-03-22 18:03:21 +00:00
parent 55e6bd929d
commit edfac283ff
8 changed files with 204 additions and 5 deletions

View File

@ -21,10 +21,14 @@ import org.apache.lucene.analysis.icu.ICUFoldingFilter;
*/
/** Factory for {@link ICUFoldingFilter} */
public class ICUFoldingFilterFactory extends BaseTokenFilterFactory {
public class ICUFoldingFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
@Override
public TokenStream create(TokenStream input) {
return new ICUFoldingFilter(input);
}
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -44,7 +44,7 @@ import com.ibm.icu.text.UnicodeSet;
* @see Normalizer2
* @see FilteredNormalizer2
*/
public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory {
public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
private Normalizer2 normalizer;
// TODO: support custom normalization
@ -78,4 +78,8 @@ public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new ICUNormalizer2Filter(input, normalizer);
}
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -0,0 +1,49 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="test" version="1.0">
<types>
<fieldtype name="string" class="solr.StrField" sortMissingLast="true" multiValued="false"/>
<fieldType name="text_icufolding" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ICUFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_icunormalizer2" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ICUNormalizer2FilterFactory" name="nfkc_cf" mode="compose"/>
</analyzer>
</fieldType>
</types>
<fields>
<field name="id" type="string" indexed="true" stored="true" required="true"/>
<field name="content_icufolding" type="text_icufolding" indexed="true" stored="true"/>
<field name="content_icunormalizer2" type="text_icunormalizer2" indexed="true" stored="true"/>
</fields>
<defaultSearchField>id</defaultSearchField>
<uniqueKey>id</uniqueKey>
</schema>

View File

@ -0,0 +1,77 @@
package org.apache.solr.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexWriter;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestFoldingMultitermExtrasQuery extends SolrTestCaseJ4 {
public String getCoreName() {
return "basic";
}
@BeforeClass
public static void beforeTests() throws Exception {
initCore("solrconfig-icucollate.xml","schema-folding-extra.xml", "analysis-extras/solr");
IndexWriter iw;
int idx = 1;
// ICUFoldingFilterFactory
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "BadMagicICUFolding"));
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Ruß"));
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ΜΆΪΟΣ"));
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Μάϊος"));
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "résumé"));
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "re\u0301sume\u0301"));
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ELİF"));
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "eli\u0307f"));
// ICUNormalizer2FilterFactory
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "BadMagicICUFolding"));
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Ruß"));
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ΜΆΪΟΣ"));
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Μάϊος"));
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "résumé"));
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "re\u0301sume\u0301"));
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ELİF"));
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "eli\u0307f"));
assertU(optimize());
}
@Test
public void testICUFolding() {
assertQ(req("q", "content_icufolding:BadMagicicuFold*"), "//result[@numFound='1']");
assertQ(req("q", "content_icufolding:rU*"), "//result[@numFound='1']");
assertQ(req("q", "content_icufolding:Re*Me"), "//result[@numFound='2']");
assertQ(req("q", "content_icufolding:RE\u0301su*"), "//result[@numFound='2']");
assertQ(req("q", "content_icufolding:El*"), "//result[@numFound='2']");
}
@Test
public void testICUNormalizer2() {
assertQ(req("q", "content_icunormalizer2:BadMagicicuFold*"), "//result[@numFound='1']");
assertQ(req("q", "content_icunormalizer2:RU*"), "//result[@numFound='1']");
assertQ(req("q", "content_icunormalizer2:Μάϊ*"), "//result[@numFound='2']");
assertQ(req("q", "content_icunormalizer2:re\u0301Su*"), "//result[@numFound='2']");
assertQ(req("q", "content_icunormalizer2:eL*"), "//result[@numFound='2']");
}
}

View File

@ -1,4 +1,3 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -37,7 +36,7 @@ import org.apache.solr.common.SolrException.ErrorCode;
* &lt;/fieldType&gt;</pre>
*
*/
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent
{
@Override
@ -53,5 +52,9 @@ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
public GreekLowerCaseFilter create(TokenStream in) {
return new GreekLowerCaseFilter(luceneMatchVersion, in);
}
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -31,8 +31,13 @@ import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
* &lt;/fieldType&gt;</pre>
*
*/
public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory {
public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
public TokenStream create(TokenStream input) {
return new TurkishLowerCaseFilter(input);
}
@Override
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -149,6 +149,28 @@
</fieldType>
<fieldType name="text_greek" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.GreekLowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_turkish" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.TurkishLowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_russian" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="int" class="solr.TrieIntField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
@ -178,6 +200,9 @@
<field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
<field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
<field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
<field name="content_greek" type="text_greek" indexed="true" stored="true"/>
<field name="content_turkish" type="text_turkish" indexed="true" stored="true"/>
<field name="content_russian" type="text_russian" indexed="true" stored="true"/>
<dynamicField name="*_straight" type="text_straight" indexed="true" stored="true"/>
<dynamicField name="*_lower" type="text_lower" indexed="true" stored="true"/>

View File

@ -67,6 +67,25 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
"content_keyword", docs[i]
));
}
// Mixing and matching amongst various languages is probalby a bad thing, so add some tests for various
// special filters
int idx = docs.length;
// Greek
assertU(adoc("id", Integer.toString(idx++), "content_greek", "Μάϊος"));
assertU(adoc("id", Integer.toString(idx++), "content_greek", "ΜΆΪΟΣ"));
// Turkish
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "\u0130STANBUL"));
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "ISPARTA"));
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "izmir"));
// Russian normalization
assertU(adoc("id", Integer.toString(idx++), "content_russian", "электромагнитной"));
assertU(adoc("id", Integer.toString(idx++), "content_russian", "Вместе"));
assertU(adoc("id", Integer.toString(idx++), "content_russian", "силе"));
assertU(optimize());
}
@ -272,4 +291,17 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
resetExceptionIgnores();
}
}
@Test
public void testGreek() {
assertQ(req("q", "content_greek:μαιο*"), "//result[@numFound='2']");
assertQ(req("q", "content_greek:ΜΆΪΟ*"), "//result[@numFound='2']");
assertQ(req("q", "content_greek:Μάϊο*"), "//result[@numFound='2']");
}
@Test
public void testRussian() {
assertQ(req("q", "content_russian:элЕктРомагн*тной"), "//result[@numFound='1']");
assertQ(req("q", "content_russian:Вме*те"), "//result[@numFound='1']");
assertQ(req("q", "content_russian:Си*е"), "//result[@numFound='1']");
assertQ(req("q", "content_russian:эЛектромагнИт*"), "//result[@numFound='1']");
}
}