SOLR-2919: Localized rangequery support for icu/jdk collation filters, parametric tailoring for icu collators

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1207070 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-11-28 11:03:24 +00:00
parent d949201403
commit 4dd6b5fbed
12 changed files with 722 additions and 6 deletions

View File

@ -395,6 +395,8 @@ New Features
"multiterm" analyzer in our schema.xml, but Solr should "do the right thing" if you don't
specify <fieldType="multiterm"> (Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
* SOLR-2919: Added support for localized range queries when the analysis chain uses
CollationKeyFilter or ICUCollationKeyFilter. (Michael Sokolov, rmuir)
Bug Fixes
----------------------

View File

@ -17,6 +17,13 @@ $Id$
the Solr 3.x ICUCollationKeyFilterFactory, and also supports
Locale-sensitive range queries. (rmuir)
================== 3.6.0 ==================
* SOLR-2919: Added parametric tailoring options to ICUCollationKeyFilterFactory.
These can be used to customize range query/sort behavior, for example to
support numeric collation, ignore punctuation/whitespace, ignore accents but
not case, control whether upper/lowercase values are sorted first, etc. (rmuir)
================== 3.5.0 ==================
(No Changes)

View File

@ -53,6 +53,15 @@ import com.ibm.icu.util.ULocale;
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
* <li>decomposition: 'no' or 'canonical' (optional)
* </ul>
* <p>
* Expert options:
* <ul>
* <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
* <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
* <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
* <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
* <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
* </ul>
*
* @see Collator
* @see ULocale
@ -60,7 +69,7 @@ import com.ibm.icu.util.ULocale;
* @deprecated use {@link org.apache.solr.schema.ICUCollationField} instead.
*/
@Deprecated
public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
private Collator collator;
public void inform(ResourceLoader loader) {
@ -68,6 +77,12 @@ public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory impleme
String localeID = args.get("locale");
String strength = args.get("strength");
String decomposition = args.get("decomposition");
String alternate = args.get("alternate");
String caseLevel = args.get("caseLevel");
String caseFirst = args.get("caseFirst");
String numeric = args.get("numeric");
String variableTop = args.get("variableTop");
if (custom == null && localeID == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
@ -110,6 +125,36 @@ public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory impleme
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
}
// expert options: concrete subclasses are always a RuleBasedCollator
RuleBasedCollator rbc = (RuleBasedCollator) collator;
if (alternate != null) {
if (alternate.equalsIgnoreCase("shifted")) {
rbc.setAlternateHandlingShifted(true);
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
rbc.setAlternateHandlingShifted(false);
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
}
}
if (caseLevel != null) {
rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
}
if (caseFirst != null) {
if (caseFirst.equalsIgnoreCase("lower")) {
rbc.setLowerCaseFirst(true);
} else if (caseFirst.equalsIgnoreCase("upper")) {
rbc.setUpperCaseFirst(true);
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
}
}
if (numeric != null) {
rbc.setNumericCollation(Boolean.parseBoolean(numeric));
}
if (variableTop != null) {
rbc.setVariableTop(variableTop);
}
}
public TokenStream create(TokenStream input) {
@ -141,4 +186,9 @@ public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory impleme
IOUtils.closeQuietly(input);
}
}
@Override
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -66,7 +66,16 @@ import com.ibm.icu.util.ULocale;
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
* <li>decomposition: 'no' or 'canonical' (optional)
* </ul>
*
* <p>
* Expert options:
* <ul>
* <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
* <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
* <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
* <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
* <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
* </ul>
*
* @see Collator
* @see ULocale
* @see RuleBasedCollator
@ -90,6 +99,12 @@ public class ICUCollationField extends FieldType {
String strength = args.remove("strength");
String decomposition = args.remove("decomposition");
String alternate = args.remove("alternate");
String caseLevel = args.remove("caseLevel");
String caseFirst = args.remove("caseFirst");
String numeric = args.remove("numeric");
String variableTop = args.remove("variableTop");
if (custom == null && localeID == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
@ -133,6 +148,37 @@ public class ICUCollationField extends FieldType {
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
}
// expert options: concrete subclasses are always a RuleBasedCollator
RuleBasedCollator rbc = (RuleBasedCollator) collator;
if (alternate != null) {
if (alternate.equalsIgnoreCase("shifted")) {
rbc.setAlternateHandlingShifted(true);
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
rbc.setAlternateHandlingShifted(false);
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
}
}
if (caseLevel != null) {
rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
}
if (caseFirst != null) {
if (caseFirst.equalsIgnoreCase("lower")) {
rbc.setLowerCaseFirst(true);
} else if (caseFirst.equalsIgnoreCase("upper")) {
rbc.setUpperCaseFirst(true);
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
}
}
if (numeric != null) {
rbc.setNumericCollation(Boolean.parseBoolean(numeric));
}
if (variableTop != null) {
rbc.setVariableTop(variableTop);
}
// we use 4.0 because it ensures we just encode the pure byte[] keys.
analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
}

View File

@ -0,0 +1,61 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for ICUCollationKeyFilter (deprecated: use ICUCollationField instead) -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="sort_ar_t" class="solr.TextField">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.ICUCollationKeyFilterFactory" locale="ar"/>
</analyzer>
</fieldtype>
<fieldtype name="sort_de_t" class="solr.TextField">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.ICUCollationKeyFilterFactory" locale="de" strength="primary"/>
</analyzer>
</fieldtype>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
<!-- copy our text to some sort fields with different orders -->
<copyField source="text" dest="sort_ar"/>
<copyField source="text" dest="sort_de"/>
</schema>

View File

@ -0,0 +1,69 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for CollationField options -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<!-- ignores punctuation and whitespace -->
<fieldtype name="sort_ignore_punctuation_t" class="solr.ICUCollationField"
locale="en" strength="primary" alternate="shifted"/>
<!-- ignores only whitespace -->
<fieldtype name="sort_ignore_space_t" class="solr.ICUCollationField"
locale="en" strength="primary" alternate="shifted" variableTop=" "/>
<!-- ignores only accents, but not case -->
<fieldtype name="sort_ignore_accents_t" class="solr.ICUCollationField"
locale="en" strength="primary" caseLevel="true"/>
<!-- sorts numerics in numeric order -->
<fieldtype name="sort_numerics_t" class="solr.ICUCollationField"
locale="en" numeric="true"/>
<!-- sorts uppercase before lowercase -->
<fieldtype name="sort_uppercase_first_t" class="solr.ICUCollationField"
locale="en" strength="tertiary" caseFirst="upper"/>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="sort_ignore_punctuation" type="sort_ignore_punctuation_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_ignore_space" type="sort_ignore_space_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_ignore_accents" type="sort_ignore_accents_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_numerics" type="sort_numerics_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_uppercase_first" type="sort_uppercase_first_t" indexed="true" stored="false" multiValued="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
<!-- copy our text to some sort fields with different orders -->
<copyField source="text" dest="sort_ignore_punctuation"/>
<copyField source="text" dest="sort_ignore_space"/>
<copyField source="text" dest="sort_ignore_accents"/>
<copyField source="text" dest="sort_numerics"/>
<copyField source="text" dest="sort_uppercase_first"/>
</schema>

View File

@ -98,6 +98,133 @@ public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
new KeywordTokenizer(new StringReader(lowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
* to quaternary level
*/
public void testIgnorePunctuation() throws IOException {
String withPunctuation = "foo-bar";
String withoutPunctuation = "foo bar";
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("locale", "en");
args.put("strength", "primary");
args.put("alternate", "shifted");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsPunctuation = factory.create(
new KeywordTokenizer(new StringReader(withPunctuation)));
TokenStream tsWithoutPunctuation = factory.create(
new KeywordTokenizer(new StringReader(withoutPunctuation)));
assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
}
/*
* Setting alternate=shifted and variableTop to shift whitespace, but not
* punctuation or symbols, to quaternary level
*/
public void testIgnoreWhitespace() throws IOException {
String withSpace = "foo bar";
String withoutSpace = "foobar";
String withPunctuation = "foo-bar";
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("locale", "en");
args.put("strength", "primary");
args.put("alternate", "shifted");
args.put("variableTop", " ");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsWithSpace = factory.create(
new KeywordTokenizer(new StringReader(withSpace)));
TokenStream tsWithoutSpace = factory.create(
new KeywordTokenizer(new StringReader(withoutSpace)));
assertCollatesToSame(tsWithSpace, tsWithoutSpace);
// now assert that punctuation still matters: foo-bar < foo bar
tsWithSpace = factory.create(
new KeywordTokenizer(new StringReader(withSpace)));
TokenStream tsWithPunctuation = factory.create(
new KeywordTokenizer(new StringReader(withPunctuation)));
assertCollation(tsWithPunctuation, tsWithSpace, -1);
}
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
public void testNumerics() throws IOException {
String nine = "foobar-9";
String ten = "foobar-10";
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("locale", "en");
args.put("numeric", "true");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsNine = factory.create(
new KeywordTokenizer(new StringReader(nine)));
TokenStream tsTen = factory.create(
new KeywordTokenizer(new StringReader(ten)));
assertCollation(tsNine, tsTen, -1);
}
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
public void testIgnoreAccentsButNotCase() throws IOException {
String withAccents = "résumé";
String withoutAccents = "resume";
String withAccentsUpperCase = "Résumé";
String withoutAccentsUpperCase = "Resume";
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("locale", "en");
args.put("strength", "primary");
args.put("caseLevel", "true");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsWithAccents = factory.create(
new KeywordTokenizer(new StringReader(withAccents)));
TokenStream tsWithoutAccents = factory.create(
new KeywordTokenizer(new StringReader(withoutAccents)));
assertCollatesToSame(tsWithAccents, tsWithoutAccents);
TokenStream tsWithAccentsUpperCase = factory.create(
new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
TokenStream tsWithoutAccentsUpperCase = factory.create(
new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
// now assert that case still matters: resume < Resume
TokenStream tsLower = factory.create(
new KeywordTokenizer(new StringReader(withoutAccents)));
TokenStream tsUpper = factory.create(
new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
assertCollation(tsLower, tsUpper, -1);
}
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
public void testUpperCaseFirst() throws IOException {
String lower = "resume";
String upper = "Resume";
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("locale", "en");
args.put("strength", "tertiary");
args.put("caseFirst", "upper");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsLower = factory.create(
new KeywordTokenizer(new StringReader(lower)));
TokenStream tsUpper = factory.create(
new KeywordTokenizer(new StringReader(upper)));
assertCollation(tsUpper, tsLower, -1);
}
/*
* For german, you might want oe to sort and match with o umlaut.
@ -156,15 +283,18 @@ public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
}
}
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
throws IOException {
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
assertCollation(stream1, stream2, 0);
}
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1
.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2
.addAttribute(CharTermAttribute.class);
assertTrue(stream1.incrementToken());
assertTrue(stream2.incrementToken());
assertEquals(term1.toString(), term2.toString());
assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
assertFalse(stream1.incrementToken());
assertFalse(stream2.incrementToken());
}

View File

@ -0,0 +1,84 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
/**
* Tests {@link ICUCollationKeyFilterFactory} with RangeQueries
*/
public class TestICUCollationKeyRangeQueries extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-icucollate.xml","schema-icucollatefilter.xml", "analysis-extras/solr");
// add some docs
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
assertU(adoc("id", "4", "text", "Töne"));
assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
assertU(adoc("id", "6", "text", ""));
assertU(adoc("id", "7", "text", "Tone"));
assertU(adoc("id", "8", "text", "Testing"));
assertU(adoc("id", "9", "text", "testing"));
assertU(adoc("id", "10", "text", "toene"));
assertU(adoc("id", "11", "text", "Tzne"));
assertU(adoc("id", "12", "text", "\u0698\u0698"));
assertU(commit());
}
/**
* Test termquery with german DIN 5007-1 primary strength.
* In this case, ö is equivalent to o (but not oe)
*/
public void testBasicTermQuery() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test rangequery again with the DIN 5007-1 collator.
* We do a range query of tone .. tp, in binary order this
* would retrieve nothing due to case and accent differences.
*/
public void testBasicRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test rangequery again with an Arabic collator.
* Binary order would normally order U+0633 in this range.
*/
public void testNegativeRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
"//*[@numFound='0']"
);
}
}

View File

@ -0,0 +1,117 @@
package org.apache.solr.schema;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.codecs.Codec;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
/**
* Tests expert options of {@link ICUCollationField}.
*/
public class TestICUCollationFieldOptions extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
assumeFalse("preflex format only supports UTF-8 encoded bytes", "Lucene3x".equals(Codec.getDefault().getName()));
initCore("solrconfig-icucollate.xml","schema-icucollateoptions.xml", "analysis-extras/solr");
// add some docs
assertU(adoc("id", "1", "text", "foo-bar"));
assertU(adoc("id", "2", "text", "foo bar"));
assertU(adoc("id", "3", "text", "foobar"));
assertU(adoc("id", "4", "text", "foobar-10"));
assertU(adoc("id", "5", "text", "foobar-9"));
assertU(adoc("id", "6", "text", "resume"));
assertU(adoc("id", "7", "text", "Résumé"));
assertU(adoc("id", "8", "text", "Resume"));
assertU(adoc("id", "9", "text", "résumé"));
assertU(commit());
}
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
* to quaternary level
*/
public void testIgnorePunctuation() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_ignore_punctuation:foobar", "sort", "id asc" ),
"//*[@numFound='3']",
"//result/doc[1]/int[@name='id'][.=1]",
"//result/doc[2]/int[@name='id'][.=2]",
"//result/doc[3]/int[@name='id'][.=3]"
);
}
/*
* Setting alternate=shifted and variableTop to shift whitespace, but not
* punctuation or symbols, to quaternary level
*/
public void testIgnoreWhitespace() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_ignore_space:\"foo bar\"", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=2]",
"//result/doc[2]/int[@name='id'][.=3]"
);
}
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
public void testNumerics() {
assertQ("Collated sort: ",
req("fl", "id", "q", "id:[4 TO 5]", "sort", "sort_numerics asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=5]",
"//result/doc[2]/int[@name='id'][.=4]"
);
}
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
public void testIgnoreAccentsButNotCase() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_ignore_accents:resume", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=6]",
"//result/doc[2]/int[@name='id'][.=9]"
);
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_ignore_accents:Resume", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=7]",
"//result/doc[2]/int[@name='id'][.=8]"
);
}
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
public void testUpperCaseFirst() {
assertQ("Collated sort: ",
req("fl", "id", "q", "id:6 OR id:8", "sort", "sort_uppercase_first asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=8]",
"//result/doc[2]/int[@name='id'][.=6]"
);
}
}

View File

@ -72,7 +72,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
* @deprecated use {@link org.apache.solr.schema.CollationField} instead.
*/
@Deprecated
public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
private Collator collator;
public void inform(ResourceLoader loader) {
@ -171,4 +171,9 @@ public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements
IOUtils.closeQuietly(input);
}
}
@Override
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -0,0 +1,61 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for CollationKeyFilter (deprecated: use CollationField instead) -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="sort_ar_t" class="solr.TextField">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.CollationKeyFilterFactory" language="ar"/>
</analyzer>
</fieldtype>
<fieldtype name="sort_de_t" class="solr.TextField">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.CollationKeyFilterFactory" language="de" strength="primary"/>
</analyzer>
</fieldtype>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
<!-- copy our text to some sort fields with different orders -->
<copyField source="text" dest="sort_ar"/>
<copyField source="text" dest="sort_de"/>
</schema>

View File

@ -0,0 +1,84 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
/**
* Tests {@link CollationKeyFilterFactory} with RangeQueries
*/
public class TestCollationKeyRangeQueries extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-basic.xml","schema-collatefilter.xml");
// add some docs
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
assertU(adoc("id", "4", "text", "Töne"));
assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
assertU(adoc("id", "6", "text", ""));
assertU(adoc("id", "7", "text", "Tone"));
assertU(adoc("id", "8", "text", "Testing"));
assertU(adoc("id", "9", "text", "testing"));
assertU(adoc("id", "10", "text", "toene"));
assertU(adoc("id", "11", "text", "Tzne"));
assertU(adoc("id", "12", "text", "\u0698\u0698"));
assertU(commit());
}
/**
* Test termquery with german DIN 5007-1 primary strength.
* In this case, ö is equivalent to o (but not oe)
*/
public void testBasicTermQuery() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test rangequery again with the DIN 5007-1 collator.
* We do a range query of tone .. tp, in binary order this
* would retrieve nothing due to case and accent differences.
*/
public void testBasicRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test rangequery again with an Arabic collator.
* Binary order would normally order U+0633 in this range.
*/
public void testNegativeRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
"//*[@numFound='0']"
);
}
}