mirror of https://github.com/apache/lucene.git
Fixes for SOLR-2921 (making more components MultiTermAware)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303939 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
55e6bd929d
commit
edfac283ff
|
@ -21,10 +21,14 @@ import org.apache.lucene.analysis.icu.ICUFoldingFilter;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/** Factory for {@link ICUFoldingFilter} */
|
/** Factory for {@link ICUFoldingFilter} */
|
||||||
public class ICUFoldingFilterFactory extends BaseTokenFilterFactory {
|
public class ICUFoldingFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
return new ICUFoldingFilter(input);
|
return new ICUFoldingFilter(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,7 +44,7 @@ import com.ibm.icu.text.UnicodeSet;
|
||||||
* @see Normalizer2
|
* @see Normalizer2
|
||||||
* @see FilteredNormalizer2
|
* @see FilteredNormalizer2
|
||||||
*/
|
*/
|
||||||
public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory {
|
public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
private Normalizer2 normalizer;
|
private Normalizer2 normalizer;
|
||||||
|
|
||||||
// TODO: support custom normalization
|
// TODO: support custom normalization
|
||||||
|
@ -78,4 +78,8 @@ public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory {
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
return new ICUNormalizer2Filter(input, normalizer);
|
return new ICUNormalizer2Filter(input, normalizer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<schema name="test" version="1.0">
|
||||||
|
<types>
|
||||||
|
<fieldtype name="string" class="solr.StrField" sortMissingLast="true" multiValued="false"/>
|
||||||
|
|
||||||
|
|
||||||
|
<fieldType name="text_icufolding" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.ICUFoldingFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<fieldType name="text_icunormalizer2" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.ICUNormalizer2FilterFactory" name="nfkc_cf" mode="compose"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
</types>
|
||||||
|
|
||||||
|
<fields>
|
||||||
|
<field name="id" type="string" indexed="true" stored="true" required="true"/>
|
||||||
|
<field name="content_icufolding" type="text_icufolding" indexed="true" stored="true"/>
|
||||||
|
<field name="content_icunormalizer2" type="text_icunormalizer2" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
</fields>
|
||||||
|
|
||||||
|
<defaultSearchField>id</defaultSearchField>
|
||||||
|
<uniqueKey>id</uniqueKey>
|
||||||
|
|
||||||
|
</schema>
|
|
@ -0,0 +1,77 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class TestFoldingMultitermExtrasQuery extends SolrTestCaseJ4 {
|
||||||
|
|
||||||
|
public String getCoreName() {
|
||||||
|
return "basic";
|
||||||
|
}
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeTests() throws Exception {
|
||||||
|
initCore("solrconfig-icucollate.xml","schema-folding-extra.xml", "analysis-extras/solr");
|
||||||
|
IndexWriter iw;
|
||||||
|
|
||||||
|
int idx = 1;
|
||||||
|
// ICUFoldingFilterFactory
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "BadMagicICUFolding"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Ruß"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ΜΆΪΟΣ"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Μάϊος"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "résumé"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "re\u0301sume\u0301"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ELİF"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "eli\u0307f"));
|
||||||
|
|
||||||
|
// ICUNormalizer2FilterFactory
|
||||||
|
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "BadMagicICUFolding"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Ruß"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ΜΆΪΟΣ"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Μάϊος"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "résumé"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "re\u0301sume\u0301"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ELİF"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "eli\u0307f"));
|
||||||
|
|
||||||
|
assertU(optimize());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testICUFolding() {
|
||||||
|
assertQ(req("q", "content_icufolding:BadMagicicuFold*"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_icufolding:rU*"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_icufolding:Re*Me"), "//result[@numFound='2']");
|
||||||
|
assertQ(req("q", "content_icufolding:RE\u0301su*"), "//result[@numFound='2']");
|
||||||
|
assertQ(req("q", "content_icufolding:El*"), "//result[@numFound='2']");
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testICUNormalizer2() {
|
||||||
|
assertQ(req("q", "content_icunormalizer2:BadMagicicuFold*"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_icunormalizer2:RU*"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_icunormalizer2:Μάϊ*"), "//result[@numFound='2']");
|
||||||
|
assertQ(req("q", "content_icunormalizer2:re\u0301Su*"), "//result[@numFound='2']");
|
||||||
|
assertQ(req("q", "content_icunormalizer2:eL*"), "//result[@numFound='2']");
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -37,7 +36,7 @@ import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
|
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent
|
||||||
{
|
{
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -53,5 +52,9 @@ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
|
||||||
public GreekLowerCaseFilter create(TokenStream in) {
|
public GreekLowerCaseFilter create(TokenStream in) {
|
||||||
return new GreekLowerCaseFilter(luceneMatchVersion, in);
|
return new GreekLowerCaseFilter(luceneMatchVersion, in);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -31,8 +31,13 @@ import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory {
|
public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
return new TurkishLowerCaseFilter(input);
|
return new TurkishLowerCaseFilter(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -149,6 +149,28 @@
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
|
|
||||||
|
<fieldType name="text_greek" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.GreekLowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<fieldType name="text_turkish" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.TurkishLowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<fieldType name="text_russian" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
|
||||||
<fieldType name="int" class="solr.TrieIntField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
<fieldType name="int" class="solr.TrieIntField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
||||||
<fieldType name="float" class="solr.TrieFloatField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
<fieldType name="float" class="solr.TrieFloatField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
||||||
<fieldType name="long" class="solr.TrieLongField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
<fieldType name="long" class="solr.TrieLongField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
@ -178,6 +200,9 @@
|
||||||
<field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
|
<field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
|
||||||
<field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
|
<field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
|
||||||
<field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
|
<field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
|
||||||
|
<field name="content_greek" type="text_greek" indexed="true" stored="true"/>
|
||||||
|
<field name="content_turkish" type="text_turkish" indexed="true" stored="true"/>
|
||||||
|
<field name="content_russian" type="text_russian" indexed="true" stored="true"/>
|
||||||
|
|
||||||
<dynamicField name="*_straight" type="text_straight" indexed="true" stored="true"/>
|
<dynamicField name="*_straight" type="text_straight" indexed="true" stored="true"/>
|
||||||
<dynamicField name="*_lower" type="text_lower" indexed="true" stored="true"/>
|
<dynamicField name="*_lower" type="text_lower" indexed="true" stored="true"/>
|
||||||
|
|
|
@ -67,6 +67,25 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
|
||||||
"content_keyword", docs[i]
|
"content_keyword", docs[i]
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
// Mixing and matching amongst various languages is probably a bad thing, so add some tests for various
|
||||||
|
// special filters
|
||||||
|
int idx = docs.length;
|
||||||
|
// Greek
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_greek", "Μάϊος"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_greek", "ΜΆΪΟΣ"));
|
||||||
|
|
||||||
|
// Turkish
|
||||||
|
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "\u0130STANBUL"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "ISPARTA"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "izmir"));
|
||||||
|
|
||||||
|
|
||||||
|
// Russian normalization
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_russian", "электромагнитной"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_russian", "Вместе"));
|
||||||
|
assertU(adoc("id", Integer.toString(idx++), "content_russian", "силе"));
|
||||||
|
|
||||||
assertU(optimize());
|
assertU(optimize());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -272,4 +291,17 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
|
||||||
resetExceptionIgnores();
|
resetExceptionIgnores();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@Test
|
||||||
|
public void testGreek() {
|
||||||
|
assertQ(req("q", "content_greek:μαιο*"), "//result[@numFound='2']");
|
||||||
|
assertQ(req("q", "content_greek:ΜΆΪΟ*"), "//result[@numFound='2']");
|
||||||
|
assertQ(req("q", "content_greek:Μάϊο*"), "//result[@numFound='2']");
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testRussian() {
|
||||||
|
assertQ(req("q", "content_russian:элЕктРомагн*тной"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_russian:Вме*те"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_russian:Си*е"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_russian:эЛектромагнИт*"), "//result[@numFound='1']");
|
||||||
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue