mirror of https://github.com/apache/lucene.git
SOLR-2919: Localized rangequery support for icu/jdk collation filters, parametric tailoring for icu collators
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1207070 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d949201403
commit
4dd6b5fbed
|
@ -395,6 +395,8 @@ New Features
|
|||
"multiterm" analyzer in our schema.xml, but Solr should "do the right thing" if you don't
|
||||
specify <fieldType="multiterm"> (Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
|
||||
|
||||
* SOLR-2919: Added support for localized range queries when the analysis chain uses
|
||||
CollationKeyFilter or ICUCollationKeyFilter. (Michael Sokolov, rmuir)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
|
|
@ -17,6 +17,13 @@ $Id$
|
|||
the Solr 3.x ICUCollationKeyFilterFactory, and also supports
|
||||
Locale-sensitive range queries. (rmuir)
|
||||
|
||||
================== 3.6.0 ==================
|
||||
|
||||
* SOLR-2919: Added parametric tailoring options to ICUCollationKeyFilterFactory.
|
||||
These can be used to customize range query/sort behavior, for example to
|
||||
support numeric collation, ignore punctuation/whitespace, ignore accents but
|
||||
not case, control whether upper/lowercase values are sorted first, etc. (rmuir)
|
||||
|
||||
================== 3.5.0 ==================
|
||||
|
||||
(No Changes)
|
||||
|
|
|
@ -53,6 +53,15 @@ import com.ibm.icu.util.ULocale;
|
|||
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
|
||||
* <li>decomposition: 'no' or 'canonical' (optional)
|
||||
* </ul>
|
||||
* <p>
|
||||
* Expert options:
|
||||
* <ul>
|
||||
* <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
|
||||
* <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
|
||||
* <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
|
||||
* <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
|
||||
* <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
|
||||
* </ul>
|
||||
*
|
||||
* @see Collator
|
||||
* @see ULocale
|
||||
|
@ -60,7 +69,7 @@ import com.ibm.icu.util.ULocale;
|
|||
* @deprecated use {@link org.apache.solr.schema.ICUCollationField} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
|
||||
private Collator collator;
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
|
@ -68,6 +77,12 @@ public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory impleme
|
|||
String localeID = args.get("locale");
|
||||
String strength = args.get("strength");
|
||||
String decomposition = args.get("decomposition");
|
||||
|
||||
String alternate = args.get("alternate");
|
||||
String caseLevel = args.get("caseLevel");
|
||||
String caseFirst = args.get("caseFirst");
|
||||
String numeric = args.get("numeric");
|
||||
String variableTop = args.get("variableTop");
|
||||
|
||||
if (custom == null && localeID == null)
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
|
||||
|
@ -110,6 +125,36 @@ public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory impleme
|
|||
else
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
|
||||
}
|
||||
|
||||
// expert options: concrete subclasses are always a RuleBasedCollator
|
||||
RuleBasedCollator rbc = (RuleBasedCollator) collator;
|
||||
if (alternate != null) {
|
||||
if (alternate.equalsIgnoreCase("shifted")) {
|
||||
rbc.setAlternateHandlingShifted(true);
|
||||
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
|
||||
rbc.setAlternateHandlingShifted(false);
|
||||
} else {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
|
||||
}
|
||||
}
|
||||
if (caseLevel != null) {
|
||||
rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
|
||||
}
|
||||
if (caseFirst != null) {
|
||||
if (caseFirst.equalsIgnoreCase("lower")) {
|
||||
rbc.setLowerCaseFirst(true);
|
||||
} else if (caseFirst.equalsIgnoreCase("upper")) {
|
||||
rbc.setUpperCaseFirst(true);
|
||||
} else {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
|
||||
}
|
||||
}
|
||||
if (numeric != null) {
|
||||
rbc.setNumericCollation(Boolean.parseBoolean(numeric));
|
||||
}
|
||||
if (variableTop != null) {
|
||||
rbc.setVariableTop(variableTop);
|
||||
}
|
||||
}
|
||||
|
||||
public TokenStream create(TokenStream input) {
|
||||
|
@ -141,4 +186,9 @@ public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory impleme
|
|||
IOUtils.closeQuietly(input);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -66,7 +66,16 @@ import com.ibm.icu.util.ULocale;
|
|||
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
|
||||
* <li>decomposition: 'no' or 'canonical' (optional)
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* Expert options:
|
||||
* <ul>
|
||||
* <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
|
||||
* <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
|
||||
* <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
|
||||
* <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
|
||||
* <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
|
||||
* </ul>
|
||||
*
|
||||
* @see Collator
|
||||
* @see ULocale
|
||||
* @see RuleBasedCollator
|
||||
|
@ -90,6 +99,12 @@ public class ICUCollationField extends FieldType {
|
|||
String strength = args.remove("strength");
|
||||
String decomposition = args.remove("decomposition");
|
||||
|
||||
String alternate = args.remove("alternate");
|
||||
String caseLevel = args.remove("caseLevel");
|
||||
String caseFirst = args.remove("caseFirst");
|
||||
String numeric = args.remove("numeric");
|
||||
String variableTop = args.remove("variableTop");
|
||||
|
||||
if (custom == null && localeID == null)
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
|
||||
|
||||
|
@ -133,6 +148,37 @@ public class ICUCollationField extends FieldType {
|
|||
else
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
|
||||
}
|
||||
|
||||
// expert options: concrete subclasses are always a RuleBasedCollator
|
||||
RuleBasedCollator rbc = (RuleBasedCollator) collator;
|
||||
if (alternate != null) {
|
||||
if (alternate.equalsIgnoreCase("shifted")) {
|
||||
rbc.setAlternateHandlingShifted(true);
|
||||
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
|
||||
rbc.setAlternateHandlingShifted(false);
|
||||
} else {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
|
||||
}
|
||||
}
|
||||
if (caseLevel != null) {
|
||||
rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
|
||||
}
|
||||
if (caseFirst != null) {
|
||||
if (caseFirst.equalsIgnoreCase("lower")) {
|
||||
rbc.setLowerCaseFirst(true);
|
||||
} else if (caseFirst.equalsIgnoreCase("upper")) {
|
||||
rbc.setUpperCaseFirst(true);
|
||||
} else {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
|
||||
}
|
||||
}
|
||||
if (numeric != null) {
|
||||
rbc.setNumericCollation(Boolean.parseBoolean(numeric));
|
||||
}
|
||||
if (variableTop != null) {
|
||||
rbc.setVariableTop(variableTop);
|
||||
}
|
||||
|
||||
// we use 4.0 because it ensures we just encode the pure byte[] keys.
|
||||
analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Test schema file for ICUCollationKeyFilter (deprecated: use ICUCollationField instead) -->
|
||||
|
||||
<schema name="test" version="1.0">
|
||||
<types>
|
||||
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||
|
||||
<!-- basic text field -->
|
||||
<fieldtype name="text" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="sort_ar_t" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.ICUCollationKeyFilterFactory" locale="ar"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="sort_de_t" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.ICUCollationKeyFilterFactory" locale="de" strength="primary"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
</types>
|
||||
|
||||
<fields>
|
||||
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="text" type="text" indexed="true" stored="false"/>
|
||||
<field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
|
||||
<field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
|
||||
</fields>
|
||||
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
<uniqueKey>id</uniqueKey>
|
||||
|
||||
<!-- copy our text to some sort fields with different orders -->
|
||||
<copyField source="text" dest="sort_ar"/>
|
||||
<copyField source="text" dest="sort_de"/>
|
||||
</schema>
|
|
@ -0,0 +1,69 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Test schema file for CollationField options -->
|
||||
|
||||
<schema name="test" version="1.0">
|
||||
<types>
|
||||
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||
|
||||
<!-- basic text field -->
|
||||
<fieldtype name="text" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<!-- ignores punctuation and whitespace -->
|
||||
<fieldtype name="sort_ignore_punctuation_t" class="solr.ICUCollationField"
|
||||
locale="en" strength="primary" alternate="shifted"/>
|
||||
<!-- ignores only whitespace -->
|
||||
<fieldtype name="sort_ignore_space_t" class="solr.ICUCollationField"
|
||||
locale="en" strength="primary" alternate="shifted" variableTop=" "/>
|
||||
<!-- ignores only accents, but not case -->
|
||||
<fieldtype name="sort_ignore_accents_t" class="solr.ICUCollationField"
|
||||
locale="en" strength="primary" caseLevel="true"/>
|
||||
<!-- sorts numerics in numeric order -->
|
||||
<fieldtype name="sort_numerics_t" class="solr.ICUCollationField"
|
||||
locale="en" numeric="true"/>
|
||||
<!-- sorts uppercase before lowercase -->
|
||||
<fieldtype name="sort_uppercase_first_t" class="solr.ICUCollationField"
|
||||
locale="en" strength="tertiary" caseFirst="upper"/>
|
||||
</types>
|
||||
|
||||
<fields>
|
||||
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="text" type="text" indexed="true" stored="false"/>
|
||||
<field name="sort_ignore_punctuation" type="sort_ignore_punctuation_t" indexed="true" stored="false" multiValued="false"/>
|
||||
<field name="sort_ignore_space" type="sort_ignore_space_t" indexed="true" stored="false" multiValued="false"/>
|
||||
<field name="sort_ignore_accents" type="sort_ignore_accents_t" indexed="true" stored="false" multiValued="false"/>
|
||||
<field name="sort_numerics" type="sort_numerics_t" indexed="true" stored="false" multiValued="false"/>
|
||||
<field name="sort_uppercase_first" type="sort_uppercase_first_t" indexed="true" stored="false" multiValued="false"/>
|
||||
</fields>
|
||||
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
<uniqueKey>id</uniqueKey>
|
||||
|
||||
<!-- copy our text to some sort fields with different orders -->
|
||||
<copyField source="text" dest="sort_ignore_punctuation"/>
|
||||
<copyField source="text" dest="sort_ignore_space"/>
|
||||
<copyField source="text" dest="sort_ignore_accents"/>
|
||||
<copyField source="text" dest="sort_numerics"/>
|
||||
<copyField source="text" dest="sort_uppercase_first"/>
|
||||
</schema>
|
|
@ -98,6 +98,133 @@ public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
|
|||
new KeywordTokenizer(new StringReader(lowerCase)));
|
||||
assertCollatesToSame(tsUpper, tsLower);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting alternate=shifted to shift whitespace, punctuation and symbols
|
||||
* to quaternary level
|
||||
*/
|
||||
public void testIgnorePunctuation() throws IOException {
|
||||
String withPunctuation = "foo-bar";
|
||||
String withoutPunctuation = "foo bar";
|
||||
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("locale", "en");
|
||||
args.put("strength", "primary");
|
||||
args.put("alternate", "shifted");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsPunctuation = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withPunctuation)));
|
||||
TokenStream tsWithoutPunctuation = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withoutPunctuation)));
|
||||
assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting alternate=shifted and variableTop to shift whitespace, but not
|
||||
* punctuation or symbols, to quaternary level
|
||||
*/
|
||||
public void testIgnoreWhitespace() throws IOException {
|
||||
String withSpace = "foo bar";
|
||||
String withoutSpace = "foobar";
|
||||
String withPunctuation = "foo-bar";
|
||||
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("locale", "en");
|
||||
args.put("strength", "primary");
|
||||
args.put("alternate", "shifted");
|
||||
args.put("variableTop", " ");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsWithSpace = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withSpace)));
|
||||
TokenStream tsWithoutSpace = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withoutSpace)));
|
||||
assertCollatesToSame(tsWithSpace, tsWithoutSpace);
|
||||
// now assert that punctuation still matters: foo-bar < foo bar
|
||||
tsWithSpace = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withSpace)));
|
||||
TokenStream tsWithPunctuation = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withPunctuation)));
|
||||
assertCollation(tsWithPunctuation, tsWithSpace, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting numeric to encode digits with numeric value, so that
|
||||
* foobar-9 sorts before foobar-10
|
||||
*/
|
||||
public void testNumerics() throws IOException {
|
||||
String nine = "foobar-9";
|
||||
String ten = "foobar-10";
|
||||
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("locale", "en");
|
||||
args.put("numeric", "true");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsNine = factory.create(
|
||||
new KeywordTokenizer(new StringReader(nine)));
|
||||
TokenStream tsTen = factory.create(
|
||||
new KeywordTokenizer(new StringReader(ten)));
|
||||
assertCollation(tsNine, tsTen, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting caseLevel=true to create an additional case level between
|
||||
* secondary and tertiary
|
||||
*/
|
||||
public void testIgnoreAccentsButNotCase() throws IOException {
|
||||
String withAccents = "résumé";
|
||||
String withoutAccents = "resume";
|
||||
String withAccentsUpperCase = "Résumé";
|
||||
String withoutAccentsUpperCase = "Resume";
|
||||
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("locale", "en");
|
||||
args.put("strength", "primary");
|
||||
args.put("caseLevel", "true");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsWithAccents = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withAccents)));
|
||||
TokenStream tsWithoutAccents = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withoutAccents)));
|
||||
assertCollatesToSame(tsWithAccents, tsWithoutAccents);
|
||||
|
||||
TokenStream tsWithAccentsUpperCase = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
|
||||
TokenStream tsWithoutAccentsUpperCase = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
|
||||
assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
|
||||
|
||||
// now assert that case still matters: resume < Resume
|
||||
TokenStream tsLower = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withoutAccents)));
|
||||
TokenStream tsUpper = factory.create(
|
||||
new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
|
||||
assertCollation(tsLower, tsUpper, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting caseFirst=upper to cause uppercase strings to sort
|
||||
* before lowercase ones.
|
||||
*/
|
||||
public void testUpperCaseFirst() throws IOException {
|
||||
String lower = "resume";
|
||||
String upper = "Resume";
|
||||
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("locale", "en");
|
||||
args.put("strength", "tertiary");
|
||||
args.put("caseFirst", "upper");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsLower = factory.create(
|
||||
new KeywordTokenizer(new StringReader(lower)));
|
||||
TokenStream tsUpper = factory.create(
|
||||
new KeywordTokenizer(new StringReader(upper)));
|
||||
assertCollation(tsUpper, tsLower, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* For german, you might want oe to sort and match with o umlaut.
|
||||
|
@ -156,15 +283,18 @@ public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
|
||||
throws IOException {
|
||||
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
|
||||
assertCollation(stream1, stream2, 0);
|
||||
}
|
||||
|
||||
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
|
||||
CharTermAttribute term1 = stream1
|
||||
.addAttribute(CharTermAttribute.class);
|
||||
CharTermAttribute term2 = stream2
|
||||
.addAttribute(CharTermAttribute.class);
|
||||
assertTrue(stream1.incrementToken());
|
||||
assertTrue(stream2.incrementToken());
|
||||
assertEquals(term1.toString(), term2.toString());
|
||||
assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
|
||||
assertFalse(stream1.incrementToken());
|
||||
assertFalse(stream2.incrementToken());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
/**
|
||||
* Tests {@link ICUCollationKeyFilterFactory} with RangeQueries
|
||||
*/
|
||||
public class TestICUCollationKeyRangeQueries extends SolrTestCaseJ4 {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig-icucollate.xml","schema-icucollatefilter.xml", "analysis-extras/solr");
|
||||
// add some docs
|
||||
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
|
||||
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
|
||||
assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
|
||||
assertU(adoc("id", "4", "text", "Töne"));
|
||||
assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
|
||||
assertU(adoc("id", "6", "text", "Testing"));
|
||||
assertU(adoc("id", "7", "text", "Tone"));
|
||||
assertU(adoc("id", "8", "text", "Testing"));
|
||||
assertU(adoc("id", "9", "text", "testing"));
|
||||
assertU(adoc("id", "10", "text", "toene"));
|
||||
assertU(adoc("id", "11", "text", "Tzne"));
|
||||
assertU(adoc("id", "12", "text", "\u0698\u0698"));
|
||||
assertU(commit());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test termquery with german DIN 5007-1 primary strength.
|
||||
* In this case, ö is equivalent to o (but not oe)
|
||||
*/
|
||||
public void testBasicTermQuery() {
|
||||
assertQ("Collated TQ: ",
|
||||
req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=4]",
|
||||
"//result/doc[2]/int[@name='id'][.=7]"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test rangequery again with the DIN 5007-1 collator.
|
||||
* We do a range query of tone .. tp, in binary order this
|
||||
* would retrieve nothing due to case and accent differences.
|
||||
*/
|
||||
public void testBasicRangeQuery() {
|
||||
assertQ("Collated RangeQ: ",
|
||||
req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=4]",
|
||||
"//result/doc[2]/int[@name='id'][.=7]"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test rangequery again with an Arabic collator.
|
||||
* Binary order would normally order U+0633 in this range.
|
||||
*/
|
||||
public void testNegativeRangeQuery() {
|
||||
assertQ("Collated RangeQ: ",
|
||||
req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
|
||||
"//*[@numFound='0']"
|
||||
);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
package org.apache.solr.schema;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.codecs.Codec;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
/**
|
||||
* Tests expert options of {@link ICUCollationField}.
|
||||
*/
|
||||
public class TestICUCollationFieldOptions extends SolrTestCaseJ4 {
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
assumeFalse("preflex format only supports UTF-8 encoded bytes", "Lucene3x".equals(Codec.getDefault().getName()));
|
||||
initCore("solrconfig-icucollate.xml","schema-icucollateoptions.xml", "analysis-extras/solr");
|
||||
// add some docs
|
||||
assertU(adoc("id", "1", "text", "foo-bar"));
|
||||
assertU(adoc("id", "2", "text", "foo bar"));
|
||||
assertU(adoc("id", "3", "text", "foobar"));
|
||||
assertU(adoc("id", "4", "text", "foobar-10"));
|
||||
assertU(adoc("id", "5", "text", "foobar-9"));
|
||||
assertU(adoc("id", "6", "text", "resume"));
|
||||
assertU(adoc("id", "7", "text", "Résumé"));
|
||||
assertU(adoc("id", "8", "text", "Resume"));
|
||||
assertU(adoc("id", "9", "text", "résumé"));
|
||||
assertU(commit());
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting alternate=shifted to shift whitespace, punctuation and symbols
|
||||
* to quaternary level
|
||||
*/
|
||||
public void testIgnorePunctuation() {
|
||||
assertQ("Collated TQ: ",
|
||||
req("fl", "id", "q", "sort_ignore_punctuation:foobar", "sort", "id asc" ),
|
||||
"//*[@numFound='3']",
|
||||
"//result/doc[1]/int[@name='id'][.=1]",
|
||||
"//result/doc[2]/int[@name='id'][.=2]",
|
||||
"//result/doc[3]/int[@name='id'][.=3]"
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting alternate=shifted and variableTop to shift whitespace, but not
|
||||
* punctuation or symbols, to quaternary level
|
||||
*/
|
||||
public void testIgnoreWhitespace() {
|
||||
assertQ("Collated TQ: ",
|
||||
req("fl", "id", "q", "sort_ignore_space:\"foo bar\"", "sort", "id asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=2]",
|
||||
"//result/doc[2]/int[@name='id'][.=3]"
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting numeric to encode digits with numeric value, so that
|
||||
* foobar-9 sorts before foobar-10
|
||||
*/
|
||||
public void testNumerics() {
|
||||
assertQ("Collated sort: ",
|
||||
req("fl", "id", "q", "id:[4 TO 5]", "sort", "sort_numerics asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=5]",
|
||||
"//result/doc[2]/int[@name='id'][.=4]"
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting caseLevel=true to create an additional case level between
|
||||
* secondary and tertiary
|
||||
*/
|
||||
public void testIgnoreAccentsButNotCase() {
|
||||
assertQ("Collated TQ: ",
|
||||
req("fl", "id", "q", "sort_ignore_accents:resume", "sort", "id asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=6]",
|
||||
"//result/doc[2]/int[@name='id'][.=9]"
|
||||
);
|
||||
|
||||
assertQ("Collated TQ: ",
|
||||
req("fl", "id", "q", "sort_ignore_accents:Resume", "sort", "id asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=7]",
|
||||
"//result/doc[2]/int[@name='id'][.=8]"
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting caseFirst=upper to cause uppercase strings to sort
|
||||
* before lowercase ones.
|
||||
*/
|
||||
public void testUpperCaseFirst() {
|
||||
assertQ("Collated sort: ",
|
||||
req("fl", "id", "q", "id:6 OR id:8", "sort", "sort_uppercase_first asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=8]",
|
||||
"//result/doc[2]/int[@name='id'][.=6]"
|
||||
);
|
||||
}
|
||||
}
|
|
@ -72,7 +72,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
|
|||
* @deprecated use {@link org.apache.solr.schema.CollationField} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
|
||||
private Collator collator;
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
|
@ -171,4 +171,9 @@ public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements
|
|||
IOUtils.closeQuietly(input);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Test schema file for CollationKeyFilter (deprecated: use CollationField instead) -->
|
||||
|
||||
<schema name="test" version="1.0">
|
||||
<types>
|
||||
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||
|
||||
<!-- basic text field -->
|
||||
<fieldtype name="text" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="sort_ar_t" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.CollationKeyFilterFactory" language="ar"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="sort_de_t" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.CollationKeyFilterFactory" language="de" strength="primary"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
</types>
|
||||
|
||||
<fields>
|
||||
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="text" type="text" indexed="true" stored="false"/>
|
||||
<field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
|
||||
<field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
|
||||
</fields>
|
||||
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
<uniqueKey>id</uniqueKey>
|
||||
|
||||
<!-- copy our text to some sort fields with different orders -->
|
||||
<copyField source="text" dest="sort_ar"/>
|
||||
<copyField source="text" dest="sort_de"/>
|
||||
</schema>
|
|
@ -0,0 +1,84 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
/**
|
||||
* Tests {@link CollationKeyFilterFactory} with RangeQueries
|
||||
*/
|
||||
public class TestCollationKeyRangeQueries extends SolrTestCaseJ4 {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig-basic.xml","schema-collatefilter.xml");
|
||||
// add some docs
|
||||
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
|
||||
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
|
||||
assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
|
||||
assertU(adoc("id", "4", "text", "Töne"));
|
||||
assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
|
||||
assertU(adoc("id", "6", "text", "Testing"));
|
||||
assertU(adoc("id", "7", "text", "Tone"));
|
||||
assertU(adoc("id", "8", "text", "Testing"));
|
||||
assertU(adoc("id", "9", "text", "testing"));
|
||||
assertU(adoc("id", "10", "text", "toene"));
|
||||
assertU(adoc("id", "11", "text", "Tzne"));
|
||||
assertU(adoc("id", "12", "text", "\u0698\u0698"));
|
||||
assertU(commit());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test termquery with german DIN 5007-1 primary strength.
|
||||
* In this case, ö is equivalent to o (but not oe)
|
||||
*/
|
||||
public void testBasicTermQuery() {
|
||||
assertQ("Collated TQ: ",
|
||||
req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=4]",
|
||||
"//result/doc[2]/int[@name='id'][.=7]"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test rangequery again with the DIN 5007-1 collator.
|
||||
* We do a range query of tone .. tp, in binary order this
|
||||
* would retrieve nothing due to case and accent differences.
|
||||
*/
|
||||
public void testBasicRangeQuery() {
|
||||
assertQ("Collated RangeQ: ",
|
||||
req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
|
||||
"//*[@numFound='2']",
|
||||
"//result/doc[1]/int[@name='id'][.=4]",
|
||||
"//result/doc[2]/int[@name='id'][.=7]"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test rangequery again with an Arabic collator.
|
||||
* Binary order would normally order U+0633 in this range.
|
||||
*/
|
||||
public void testNegativeRangeQuery() {
|
||||
assertQ("Collated RangeQ: ",
|
||||
req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
|
||||
"//*[@numFound='0']"
|
||||
);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue