SOLR-2396: add [ICU]CollationField

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1086637 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-03-29 16:37:39 +00:00
parent 447b497c57
commit 4bfb56d42b
15 changed files with 1044 additions and 8 deletions

View File

@@ -114,6 +114,10 @@ New Features
* SOLR-2417: Add explain info directly to return documents using ?fl=_explain_ (ryan)
* SOLR-2396: Add CollationField, which is much more efficient than
the Solr 3.x CollationKeyFilterFactory, and also supports
Locale-sensitive range queries. (rmuir)
Optimizations
----------------------

View File

@@ -13,7 +13,9 @@ analyzers for Chinese and Polish.
$Id$
================== Release 4.0-dev ==================
(No Changes)
* SOLR-2396: Add ICUCollationField, which is much more efficient than
the Solr 3.x ICUCollationKeyFilterFactory, and also supports
Locale-sensitive range queries. (rmuir)
================== Release 3.2-dev ==================

View File

@@ -57,7 +57,9 @@ import com.ibm.icu.util.ULocale;
* @see Collator
* @see ULocale
* @see RuleBasedCollator
* @deprecated use {@link org.apache.solr.schema.ICUCollationField} instead.
*/
@Deprecated
public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private Collator collator;
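The @deprecated tag above points users at the new field type. As a rough migration sketch (the field type name and analyzer chain are illustrative, not part of this commit), a 3.x schema that produced German collation keys through the filter factory, typically paired with a KeywordTokenizer:

    <fieldtype name="collated_de" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.ICUCollationKeyFilterFactory" locale="de" strength="primary"/>
      </analyzer>
    </fieldtype>

would instead declare the new field type directly, carrying the same locale/strength/decomposition attributes:

    <fieldtype name="collated_de" class="solr.ICUCollationField" locale="de" strength="primary"/>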

View File

@@ -0,0 +1,228 @@
package org.apache.solr.schema;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
/**
* Field for collated sort keys.
* These can be used for locale-sensitive sort and range queries.
* <p>
* This field can be created in two ways:
* <ul>
* <li>Based upon a system collator associated with a Locale.
* <li>Based upon a tailored ruleset.
* </ul>
* <p>
* Using a System collator:
* <ul>
* <li>locale: RFC 3066 locale ID (mandatory)
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
* <li>decomposition: 'no', or 'canonical' (optional)
* </ul>
* <p>
* Using a Tailored ruleset:
* <ul>
* <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
* <li>decomposition: 'no' or 'canonical' (optional)
* </ul>
*
* @see Collator
* @see ULocale
* @see RuleBasedCollator
*/
public class ICUCollationField extends FieldType {
private Analyzer analyzer;
@Override
protected void init(IndexSchema schema, Map<String,String> args) {
properties |= TOKENIZED; // this ensures our analyzer gets hit
setup(schema.getResourceLoader(), args);
super.init(schema, args);
}
/**
* Setup the field according to the provided parameters
*/
private void setup(ResourceLoader loader, Map<String,String> args) {
String custom = args.remove("custom");
String localeID = args.remove("locale");
String strength = args.remove("strength");
String decomposition = args.remove("decomposition");
if (custom == null && localeID == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
if (custom != null && localeID != null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
+ "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
+ "Then save the entire customized ruleset to a file, and use with the custom parameter");
final Collator collator;
if (localeID != null) {
// create from a system collator, based on Locale.
collator = createFromLocale(localeID);
} else {
// create from a custom ruleset
collator = createFromRules(custom, loader);
}
// set the strength flag, otherwise it will be the default.
if (strength != null) {
if (strength.equalsIgnoreCase("primary"))
collator.setStrength(Collator.PRIMARY);
else if (strength.equalsIgnoreCase("secondary"))
collator.setStrength(Collator.SECONDARY);
else if (strength.equalsIgnoreCase("tertiary"))
collator.setStrength(Collator.TERTIARY);
else if (strength.equalsIgnoreCase("quaternary"))
collator.setStrength(Collator.QUATERNARY);
else if (strength.equalsIgnoreCase("identical"))
collator.setStrength(Collator.IDENTICAL);
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
}
// set the decomposition flag, otherwise it will be the default.
if (decomposition != null) {
if (decomposition.equalsIgnoreCase("no"))
collator.setDecomposition(Collator.NO_DECOMPOSITION);
else if (decomposition.equalsIgnoreCase("canonical"))
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
}
// we use 4.0 because it ensures we just encode the pure byte[] keys.
analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
}
/**
* Create a locale from localeID.
* Then return the appropriate collator for the locale.
*/
private Collator createFromLocale(String localeID) {
return Collator.getInstance(new ULocale(localeID));
}
/**
* Read custom rules from a file, and create a RuleBasedCollator
* The file cannot support comments, as # might be in the rules!
*/
private Collator createFromRules(String fileName, ResourceLoader loader) {
InputStream input = null;
try {
input = loader.openResource(fileName);
String rules = IOUtils.toString(input, "UTF-8");
return new RuleBasedCollator(rules);
} catch (Exception e) {
// io error or invalid rules
throw new RuntimeException(e);
} finally {
IOUtils.closeQuietly(input);
}
}
@Override
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
writer.writeStr(name, f.stringValue(), true);
}
@Override
public SortField getSortField(SchemaField field, boolean top) {
return getStringSort(field, top);
}
@Override
public Analyzer getAnalyzer() {
return analyzer;
}
@Override
public Analyzer getQueryAnalyzer() {
return analyzer;
}
/**
* Analyze the range with the analyzer, instead of the collator.
* Because ICU collators are not thread safe, this keeps things
* simple (we already have a threadlocal clone in the reused TS).
*/
private BytesRef analyzeRangePart(String field, String part) {
TokenStream source;
try {
source = analyzer.reusableTokenStream(field, new StringReader(part));
source.reset();
} catch (IOException e) {
source = analyzer.tokenStream(field, new StringReader(part));
}
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
// we control the analyzer here: most errors are impossible
try {
if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
termAtt.fillBytesRef();
assert !source.incrementToken();
} catch (IOException e) {
throw new RuntimeException("error analyzing range part: " + part, e);
}
try {
source.close();
} catch (IOException ignored) {}
return new BytesRef(bytes);
}
@Override
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
String f = field.getName();
BytesRef low = part1 == null ? null : analyzeRangePart(f, part1);
BytesRef high = part2 == null ? null : analyzeRangePart(f, part2);
return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
}
}
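The javadoc above also describes a tailored-ruleset mode. A minimal sketch of such a declaration (names are illustrative; customrules.dat would hold the text returned by a tailored ICU RuleBasedCollator's getRules(), which is exactly how the test added later in this commit generates it):

    <fieldtype name="collated_custom" class="solr.ICUCollationField" custom="customrules.dat" strength="primary"/>
    <field name="sort_custom" type="collated_custom" indexed="true" stored="false"/>

Because getRangeQuery() runs both endpoints through the same collation analyzer, a query like sort_custom:[tone TO tp] compares encoded collation keys rather than raw term bytes, which is what makes the range locale-sensitive.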

View File

@@ -1,4 +0,0 @@
Please don't remove this silly file!
This is here to make sure the dir is not empty... otherwise Hg/git
clones have problems.

View File

@@ -0,0 +1,59 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for CollationField -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="sort_ar_t" class="solr.ICUCollationField" locale="ar"/>
<fieldtype name="sort_de_t" class="solr.ICUCollationField" locale="de" strength="primary"/>
<fieldtype name="sort_tr_canon_t" class="solr.ICUCollationField" locale="tr" strength="primary" decomposition="canonical"/>
<fieldtype name="sort_da_t" class="solr.ICUCollationField" locale="da" strength="primary"/>
<fieldtype name="sort_custom_t" class="solr.ICUCollationField" custom="customrules.dat" strength="primary"/>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_tr_canon" type="sort_tr_canon_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_da" type="sort_da_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_custom" type="sort_custom_t" indexed="true" stored="false" multiValued="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
<!-- copy our text to some sort fields with different orders -->
<copyField source="text" dest="sort_ar"/>
<copyField source="text" dest="sort_de"/>
<copyField source="text" dest="sort_tr_canon"/>
<copyField source="text" dest="sort_da"/>
<copyField source="text" dest="sort_custom"/>
</schema>

View File

@@ -0,0 +1,23 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<config>
<luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
<requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
</config>

View File

@@ -34,6 +34,7 @@ import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
@Deprecated
public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
/*

View File

@@ -0,0 +1,186 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import java.io.File;
import java.io.FileOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
/**
* Tests {@link ICUCollationField} with TermQueries, RangeQueries, and sort order.
*/
public class TestICUCollationField extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
String home = setupSolrHome();
initCore("solrconfig.xml","schema.xml", home);
// add some docs
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
assertU(adoc("id", "4", "text", "Töne"));
assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
assertU(adoc("id", "6", "text", ""));
assertU(adoc("id", "7", "text", "Tone"));
assertU(adoc("id", "8", "text", "Testing"));
assertU(adoc("id", "9", "text", "testing"));
assertU(adoc("id", "10", "text", "toene"));
assertU(adoc("id", "11", "text", "Tzne"));
assertU(adoc("id", "12", "text", "\u0698\u0698"));
assertU(commit());
}
/**
* Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
* These are largish files, and JVM-specific (as our documentation says, you should always
* look out for JVM differences with collation).
* So it's preferable to create this file on-the-fly.
*/
public static String setupSolrHome() throws Exception {
// make a solr home underneath the test's TEMP_DIR
File tmpFile = File.createTempFile("test", "tmp", TEMP_DIR);
tmpFile.delete();
tmpFile.mkdir();
// make data and conf dirs
new File(tmpFile, "data").mkdir();
File confDir = new File(tmpFile, "conf");
confDir.mkdir();
// copy over configuration files
FileUtils.copyFile(getFile("solr-analysis-extras/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
FileUtils.copyFile(getFile("solr-analysis-extras/conf/schema-icucollate.xml"), new File(confDir, "schema.xml"));
// generate custom collation rules (DIN 5007-2), saving to customrules.dat
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"));
IOUtils.write(tailoredRules, os, "UTF-8");
os.close();
return tmpFile.getAbsolutePath();
}
/**
* Test termquery with german DIN 5007-1 primary strength.
* In this case, ö is equivalent to o (but not oe)
*/
public void testBasicTermQuery() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test rangequery again with the DIN 5007-1 collator.
* We do a range query of tone .. tp, in binary order this
* would retrieve nothing due to case and accent differences.
*/
public void testBasicRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test sort with a danish collator. ö is ordered after z
*/
public void testBasicSort() {
assertQ("Collated Sort: ",
req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=11]",
"//result/doc[2]/int[@name='id'][.=4]"
);
}
/**
* Test sort with an arabic collator. U+0633 is ordered after U+0698.
* With a binary collator, the range would also return nothing.
*/
public void testArabicSort() {
assertQ("Collated Sort: ",
req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=12]",
"//result/doc[2]/int[@name='id'][.=1]"
);
}
/**
* Test rangequery again with an Arabic collator.
* Binary order would normally order U+0633 in this range.
*/
public void testNegativeRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
"//*[@numFound='0']"
);
}
/**
* Test canonical decomposition with turkish primary strength.
* With this sort order, İ is the uppercase form of i, and I is the uppercase form of ı.
* We index a decomposed form of İ.
*/
public void testCanonicalDecomposition() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"", "sort", "id asc" ),
"//*[@numFound='3']",
"//result/doc[1]/int[@name='id'][.=2]",
"//result/doc[2]/int[@name='id'][.=3]",
"//result/doc[3]/int[@name='id'][.=5]"
);
}
/**
* Test termquery with custom collator (DIN 5007-2).
* In this case, ö is equivalent to oe (but not o)
*/
public void testCustomCollation() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_custom:toene", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=10]"
);
}
}

View File

@@ -69,7 +69,9 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
* @see Locale
* @see RuleBasedCollator
* @since solr 3.1
* @deprecated use {@link org.apache.solr.schema.CollationField} instead.
*/
@Deprecated
public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private Collator collator;

View File

@@ -0,0 +1,250 @@
package org.apache.solr.schema;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.text.Collator;
import java.text.ParseException;
import java.text.RuleBasedCollator;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
/**
* Field for collated sort keys.
* These can be used for locale-sensitive sort and range queries.
* <p>
* This field can be created in two ways:
* <ul>
* <li>Based upon a system collator associated with a Locale.
* <li>Based upon a tailored ruleset.
* </ul>
* <p>
* Using a System collator:
* <ul>
* <li>language: ISO-639 language code (mandatory)
* <li>country: ISO-3166 country code (optional)
* <li>variant: vendor or browser-specific code (optional)
* <li>strength: 'primary','secondary','tertiary', or 'identical' (optional)
* <li>decomposition: 'no','canonical', or 'full' (optional)
* </ul>
* <p>
* Using a Tailored ruleset:
* <ul>
* <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
* <li>strength: 'primary','secondary','tertiary', or 'identical' (optional)
* <li>decomposition: 'no','canonical', or 'full' (optional)
* </ul>
*
* @see Collator
* @see Locale
* @see RuleBasedCollator
* @since solr 4.0
*/
public class CollationField extends FieldType {
private Analyzer analyzer;
@Override
protected void init(IndexSchema schema, Map<String,String> args) {
properties |= TOKENIZED; // this ensures our analyzer gets hit
setup(schema.getResourceLoader(), args);
super.init(schema, args);
}
/**
* Setup the field according to the provided parameters
*/
private void setup(ResourceLoader loader, Map<String,String> args) {
String custom = args.remove("custom");
String language = args.remove("language");
String country = args.remove("country");
String variant = args.remove("variant");
String strength = args.remove("strength");
String decomposition = args.remove("decomposition");
final Collator collator;
if (custom == null && language == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or language is required.");
if (custom != null &&
(language != null || country != null || variant != null))
throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both language and custom. "
+ "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
+ "Then save the entire customized ruleset to a file, and use with the custom parameter");
if (language != null) {
// create from a system collator, based on Locale.
collator = createFromLocale(language, country, variant);
} else {
// create from a custom ruleset
collator = createFromRules(custom, loader);
}
// set the strength flag, otherwise it will be the default.
if (strength != null) {
if (strength.equalsIgnoreCase("primary"))
collator.setStrength(Collator.PRIMARY);
else if (strength.equalsIgnoreCase("secondary"))
collator.setStrength(Collator.SECONDARY);
else if (strength.equalsIgnoreCase("tertiary"))
collator.setStrength(Collator.TERTIARY);
else if (strength.equalsIgnoreCase("identical"))
collator.setStrength(Collator.IDENTICAL);
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
}
// set the decomposition flag, otherwise it will be the default.
if (decomposition != null) {
if (decomposition.equalsIgnoreCase("no"))
collator.setDecomposition(Collator.NO_DECOMPOSITION);
else if (decomposition.equalsIgnoreCase("canonical"))
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
else if (decomposition.equalsIgnoreCase("full"))
collator.setDecomposition(Collator.FULL_DECOMPOSITION);
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
}
// we use 4.0 because it ensures we just encode the pure byte[] keys.
analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
}
/**
* Create a locale from language, with optional country and variant.
* Then return the appropriate collator for the locale.
*/
private Collator createFromLocale(String language, String country, String variant) {
Locale locale;
if (language != null && country == null && variant != null)
throw new SolrException(ErrorCode.SERVER_ERROR,
"To specify variant, country is required");
else if (language != null && country != null && variant != null)
locale = new Locale(language, country, variant);
else if (language != null && country != null)
locale = new Locale(language, country);
else
locale = new Locale(language);
return Collator.getInstance(locale);
}
/**
* Read custom rules from a file, and create a RuleBasedCollator
* The file cannot support comments, as # might be in the rules!
*/
private Collator createFromRules(String fileName, ResourceLoader loader) {
InputStream input = null;
try {
input = loader.openResource(fileName);
String rules = IOUtils.toString(input, "UTF-8");
return new RuleBasedCollator(rules);
} catch (IOException e) {
// io error
throw new RuntimeException(e);
} catch (ParseException e) {
// invalid rules
throw new RuntimeException(e);
} finally {
IOUtils.closeQuietly(input);
}
}
@Override
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
writer.writeStr(name, f.stringValue(), true);
}
@Override
public SortField getSortField(SchemaField field, boolean top) {
return getStringSort(field, top);
}
@Override
public Analyzer getAnalyzer() {
return analyzer;
}
@Override
public Analyzer getQueryAnalyzer() {
return analyzer;
}
/**
* Analyze the range with the analyzer, instead of the collator.
* Because JDK collators might not be thread safe (when they are,
* it's just that all methods are synced), this keeps things
* simple (we already have a threadlocal clone in the reused TS).
*/
private BytesRef analyzeRangePart(String field, String part) {
TokenStream source;
try {
source = analyzer.reusableTokenStream(field, new StringReader(part));
source.reset();
} catch (IOException e) {
source = analyzer.tokenStream(field, new StringReader(part));
}
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
// we control the analyzer here: most errors are impossible
try {
if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
termAtt.fillBytesRef();
assert !source.incrementToken();
} catch (IOException e) {
throw new RuntimeException("error analyzing range part: " + part, e);
}
try {
source.close();
} catch (IOException ignored) {}
return new BytesRef(bytes);
}
@Override
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
String f = field.getName();
BytesRef low = part1 == null ? null : analyzeRangePart(f, part1);
BytesRef high = part2 == null ? null : analyzeRangePart(f, part2);
return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
}
}
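CollationField mirrors the ICU variant but is built on java.text.Collator, so the locale is supplied as separate language/country/variant attributes rather than a single RFC 3066 locale ID. A minimal declaration sketch (type and field names illustrative, not from this commit):

    <fieldtype name="collated_de_jdk" class="solr.CollationField" language="de" country="DE" strength="primary"/>
    <field name="sort_de_jdk" type="collated_de_jdk" indexed="true" stored="false"/>

The custom= and decomposition= options work as documented in the javadoc above; the deprecated CollationKeyFilterFactory accepted the same attributes on a filter inside a TextField analyzer.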

View File

@@ -122,9 +122,9 @@ public class SolrQueryParser extends QueryParser {
SchemaField sf = schema.getFieldOrNull(field);
if (sf != null) {
FieldType ft = sf.getType();
// delegate to type for everything except TextField
if (ft instanceof TextField) {
return super.getFieldQuery(field, queryText, quoted || ((TextField)ft).getAutoGeneratePhraseQueries());
// delegate to type for everything except tokenized fields
if (ft.isTokenized()) {
return super.getFieldQuery(field, queryText, quoted || (ft instanceof TextField && ((TextField)ft).getAutoGeneratePhraseQueries()));
} else {
return sf.getType().getFieldQuery(parser, sf, queryText);
}

View File

@@ -0,0 +1,62 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for CollationField -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="sort_ar_t" class="solr.CollationField" language="ar"/>
<fieldtype name="sort_de_t" class="solr.CollationField" language="de" strength="primary"/>
<fieldtype name="sort_tr_canon_t" class="solr.CollationField" language="tr" strength="primary" decomposition="canonical"/>
<fieldtype name="sort_zh_full_t" class="solr.CollationField" language="zh" strength="identical" decomposition="full"/>
<fieldtype name="sort_da_t" class="solr.CollationField" language="da" strength="primary"/>
<fieldtype name="sort_custom_t" class="solr.CollationField" custom="customrules.dat" strength="primary"/>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_tr_canon" type="sort_tr_canon_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_zh_full" type="sort_zh_full_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_da" type="sort_da_t" indexed="true" stored="false" multiValued="false"/>
<field name="sort_custom" type="sort_custom_t" indexed="true" stored="false" multiValued="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
<!-- copy our text to some sort fields with different orders -->
<copyField source="text" dest="sort_ar"/>
<copyField source="text" dest="sort_de"/>
<copyField source="text" dest="sort_tr_canon"/>
<copyField source="text" dest="sort_zh_full"/>
<copyField source="text" dest="sort_da"/>
<copyField source="text" dest="sort_custom"/>
</schema>

View File

@@ -0,0 +1,23 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<config>
<luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
<requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
</config>

View File

@@ -0,0 +1,198 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import java.io.File;
import java.io.FileOutputStream;
import java.text.Collator;
import java.text.RuleBasedCollator;
import java.util.Locale;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
/**
* Tests {@link CollationField} with TermQueries, RangeQueries, and sort order.
*/
public class TestCollationField extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
String home = setupSolrHome();
initCore("solrconfig.xml","schema.xml", home);
// add some docs
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
assertU(adoc("id", "4", "text", "Töne"));
assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
assertU(adoc("id", "6", "text", ""));
assertU(adoc("id", "7", "text", "Tone"));
assertU(adoc("id", "8", "text", "Testing"));
assertU(adoc("id", "9", "text", "testing"));
assertU(adoc("id", "10", "text", "toene"));
assertU(adoc("id", "11", "text", "Tzne"));
assertU(adoc("id", "12", "text", "\u0698\u0698"));
assertU(commit());
}
/**
* Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
* These are largish files, and JVM-specific (as our documentation says, you should always
* look out for JVM differences with collation).
* So it's preferable to create this file on-the-fly.
*/
public static String setupSolrHome() throws Exception {
// make a solr home underneath the test's TEMP_DIR
File tmpFile = File.createTempFile("test", "tmp", TEMP_DIR);
tmpFile.delete();
tmpFile.mkdir();
// make data and conf dirs
new File(tmpFile, "data").mkdir();
File confDir = new File(tmpFile, "conf");
confDir.mkdir();
// copy over configuration files
FileUtils.copyFile(getFile("solr/conf/solrconfig-collate.xml"), new File(confDir, "solrconfig.xml"));
FileUtils.copyFile(getFile("solr/conf/schema-collate.xml"), new File(confDir, "schema.xml"));
// generate custom collation rules (DIN 5007-2), saving to customrules.dat
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new Locale("de", "DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"));
IOUtils.write(tailoredRules, os, "UTF-8");
os.close();
return tmpFile.getAbsolutePath();
}
/**
* Test termquery with german DIN 5007-1 primary strength.
* In this case, ö is equivalent to o (but not oe)
*/
public void testBasicTermQuery() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test rangequery again with the DIN 5007-1 collator.
* We do a range query of tone .. tp, in binary order this
* would retrieve nothing due to case and accent differences.
*/
public void testBasicRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test sort with a danish collator. ö is ordered after z
*/
public void testBasicSort() {
assertQ("Collated Sort: ",
req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=11]",
"//result/doc[2]/int[@name='id'][.=4]"
);
}
/**
* Test sort with an arabic collator. U+0633 is ordered after U+0698.
* With a binary collator, the range would also return nothing.
*/
public void testArabicSort() {
assertQ("Collated Sort: ",
req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=12]",
"//result/doc[2]/int[@name='id'][.=1]"
);
}
/**
* Test rangequery again with an Arabic collator.
* Binary order would normally order U+0633 in this range.
*/
public void testNegativeRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
"//*[@numFound='0']"
);
}
/**
* Test canonical decomposition with turkish primary strength.
* With this sort order, İ is the uppercase form of i, and I is the uppercase form of ı.
* We index a decomposed form of İ.
*/
public void testCanonicalDecomposition() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"", "sort", "id asc" ),
"//*[@numFound='3']",
"//result/doc[1]/int[@name='id'][.=2]",
"//result/doc[2]/int[@name='id'][.=3]",
"//result/doc[3]/int[@name='id'][.=5]"
);
}
/**
* Test full decomposition with Chinese identical strength.
* The full-width form "Ｔｅｓｔｉｎｇ" is treated as identical to "Testing"
*/
public void testFullDecomposition() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_zh_full:Testing", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=6]",
"//result/doc[2]/int[@name='id'][.=8]"
);
}
/**
* Test termquery with custom collator (DIN 5007-2).
* In this case, ö is equivalent to oe (but not o)
*/
public void testCustomCollation() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_custom:toene", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=10]"
);
}
}