From edfac283ff8f77f40c59090c93a615bfb543f19a Mon Sep 17 00:00:00 2001 From: Erick Erickson Date: Thu, 22 Mar 2012 18:03:21 +0000 Subject: [PATCH] Fixes for SOLR-2921 (making more components MultiTermAware) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303939 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/ICUFoldingFilterFactory.java | 6 +- .../analysis/ICUNormalizer2FilterFactory.java | 6 +- .../solr/conf/schema-folding-extra.xml | 49 ++++++++++++ .../TestFoldingMultitermExtrasQuery.java | 77 +++++++++++++++++++ .../analysis/GreekLowerCaseFilterFactory.java | 7 +- .../TurkishLowerCaseFilterFactory.java | 7 +- .../test-files/solr/conf/schema-folding.xml | 25 ++++++ .../search/TestFoldingMultitermQuery.java | 32 ++++++++ 8 files changed, 204 insertions(+), 5 deletions(-) create mode 100644 solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-folding-extra.xml create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java index c0aa1fbd186..ebedc1dd69e 100644 --- a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java @@ -21,10 +21,14 @@ import org.apache.lucene.analysis.icu.ICUFoldingFilter; */ /** Factory for {@link ICUFoldingFilter} */ -public class ICUFoldingFilterFactory extends BaseTokenFilterFactory { +public class ICUFoldingFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent { @Override public TokenStream create(TokenStream input) { return new ICUFoldingFilter(input); } + + public Object getMultiTermComponent() { + return this; + } } diff --git 
a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java index 860a5c53d9a..63b465723ee 100644 --- a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java @@ -44,7 +44,7 @@ import com.ibm.icu.text.UnicodeSet; * @see Normalizer2 * @see FilteredNormalizer2 */ -public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory { +public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent { private Normalizer2 normalizer; // TODO: support custom normalization @@ -78,4 +78,8 @@ public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new ICUNormalizer2Filter(input, normalizer); } + + public Object getMultiTermComponent() { + return this; + } } diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-folding-extra.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-folding-extra.xml new file mode 100644 index 00000000000..e903557cfe4 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-folding-extra.xml @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + id + + diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java new file mode 100644 index 00000000000..1b8b362407d --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java @@ -0,0 +1,77 @@ +package org.apache.solr.analysis; + +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexWriter; +import org.apache.solr.SolrTestCaseJ4; +import org.junit.BeforeClass; +import org.junit.Test; + +public class TestFoldingMultitermExtrasQuery extends SolrTestCaseJ4 { + + public String getCoreName() { + return "basic"; + } + + @BeforeClass + public static void beforeTests() throws Exception { + initCore("solrconfig-icucollate.xml","schema-folding-extra.xml", "analysis-extras/solr"); + IndexWriter iw; + + int idx = 1; + // ICUFoldingFilterFactory + assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "BadMagicICUFolding")); + assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Ruß")); + assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ΜΆΪΟΣ")); + assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Μάϊος")); + assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "résumé")); + assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "re\u0301sume\u0301")); + assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ELİF")); + assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "eli\u0307f")); + + // ICUNormalizer2FilterFactory + + assertU(adoc("id", 
Integer.toString(idx++), "content_icunormalizer2", "BadMagicICUFolding")); + assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Ruß")); + assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ΜΆΪΟΣ")); + assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Μάϊος")); + assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "résumé")); + assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "re\u0301sume\u0301")); + assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ELİF")); + assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "eli\u0307f")); + + assertU(optimize()); + } + + @Test + public void testICUFolding() { + assertQ(req("q", "content_icufolding:BadMagicicuFold*"), "//result[@numFound='1']"); + assertQ(req("q", "content_icufolding:rU*"), "//result[@numFound='1']"); + assertQ(req("q", "content_icufolding:Re*Me"), "//result[@numFound='2']"); + assertQ(req("q", "content_icufolding:RE\u0301su*"), "//result[@numFound='2']"); + assertQ(req("q", "content_icufolding:El*"), "//result[@numFound='2']"); + } + @Test + public void testICUNormalizer2() { + assertQ(req("q", "content_icunormalizer2:BadMagicicuFold*"), "//result[@numFound='1']"); + assertQ(req("q", "content_icunormalizer2:RU*"), "//result[@numFound='1']"); + assertQ(req("q", "content_icunormalizer2:Μάϊ*"), "//result[@numFound='2']"); + assertQ(req("q", "content_icunormalizer2:re\u0301Su*"), "//result[@numFound='2']"); + assertQ(req("q", "content_icunormalizer2:eL*"), "//result[@numFound='2']"); + } +} diff --git a/solr/core/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java index 2742260ef3e..6952d6e657c 100644 --- a/solr/core/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java @@ -1,4 +1,3 @@ - /** 
* Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -37,7 +36,7 @@ import org.apache.solr.common.SolrException.ErrorCode; * </fieldType> * */ -public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory +public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent { @Override @@ -53,5 +52,9 @@ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory public GreekLowerCaseFilter create(TokenStream in) { return new GreekLowerCaseFilter(luceneMatchVersion, in); } + + public Object getMultiTermComponent() { + return this; + } } diff --git a/solr/core/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java index 153ad026b2a..b360434b3f3 100644 --- a/solr/core/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java @@ -31,8 +31,13 @@ import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; * </fieldType> * */ -public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory { +public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent { public TokenStream create(TokenStream input) { return new TurkishLowerCaseFilter(input); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/solr/core/src/test-files/solr/conf/schema-folding.xml b/solr/core/src/test-files/solr/conf/schema-folding.xml index 0e77b8b59a6..7992c9890dd 100644 --- a/solr/core/src/test-files/solr/conf/schema-folding.xml +++ b/solr/core/src/test-files/solr/conf/schema-folding.xml @@ -149,6 +149,28 @@ + + + + + + + + + + + + + + + + + + + + + + @@ -178,6 +200,9 @@ + + + diff --git a/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java 
b/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java index 888f6047291..e0b653c2828 100644 --- a/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java +++ b/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java @@ -67,6 +67,25 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 { "content_keyword", docs[i] )); } + // Mixing and matching amongst various languages is probably a bad thing, so add some tests for various + // special filters + int idx = docs.length; + // Greek + assertU(adoc("id", Integer.toString(idx++), "content_greek", "Μάϊος")); + assertU(adoc("id", Integer.toString(idx++), "content_greek", "ΜΆΪΟΣ")); + + // Turkish + + assertU(adoc("id", Integer.toString(idx++), "content_turkish", "\u0130STANBUL")); + assertU(adoc("id", Integer.toString(idx++), "content_turkish", "ISPARTA")); + assertU(adoc("id", Integer.toString(idx++), "content_turkish", "izmir")); + + + // Russian normalization + assertU(adoc("id", Integer.toString(idx++), "content_russian", "электромагнитной")); + assertU(adoc("id", Integer.toString(idx++), "content_russian", "Вместе")); + assertU(adoc("id", Integer.toString(idx++), "content_russian", "силе")); + assertU(optimize()); } @@ -272,4 +291,17 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 { resetExceptionIgnores(); } } + @Test + public void testGreek() { + assertQ(req("q", "content_greek:μαιο*"), "//result[@numFound='2']"); + assertQ(req("q", "content_greek:ΜΆΪΟ*"), "//result[@numFound='2']"); + assertQ(req("q", "content_greek:Μάϊο*"), "//result[@numFound='2']"); + } + @Test + public void testRussian() { + assertQ(req("q", "content_russian:элЕктРомагн*тной"), "//result[@numFound='1']"); + assertQ(req("q", "content_russian:Вме*те"), "//result[@numFound='1']"); + assertQ(req("q", "content_russian:Си*е"), "//result[@numFound='1']"); + assertQ(req("q", "content_russian:эЛектромагнИт*"), "//result[@numFound='1']"); + } } \ No newline at end of file