SOLR-5506: Support docValues in (ICU)CollationField

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1546245 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-11-28 00:31:03 +00:00
parent e5fbf6dce2
commit fcb7e37c29
7 changed files with 606 additions and 15 deletions

View File

@ -106,6 +106,9 @@ New Features
* SOLR-5492: Return the replica that actually served the query in shards.info * SOLR-5492: Return the replica that actually served the query in shards.info
response. (shalin) response. (shalin)
* SOLR-5506: Support docValues in CollationField and ICUCollationField.
(Robert Muir)
Bug Fixes Bug Fixes
---------------------- ----------------------

View File

@ -19,6 +19,9 @@ package org.apache.solr.schema;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -26,7 +29,12 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.collation.ICUCollationKeyAnalyzer; import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.StorableField; import org.apache.lucene.index.StorableField;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocTermOrdsRangeFilter;
import org.apache.lucene.search.FieldCacheRangeFilter;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TermRangeQuery;
@ -229,12 +237,12 @@ public class ICUCollationField extends FieldType {
} }
/** /**
* analyze the range with the analyzer, instead of the collator. * analyze the text with the analyzer, instead of the collator.
* because icu collators are not thread safe, this keeps things * because icu collators are not thread safe, this keeps things
* simple (we already have a threadlocal clone in the reused TS) * simple (we already have a threadlocal clone in the reused TS)
*/ */
private BytesRef analyzeRangePart(String field, String part) { private BytesRef getCollationKey(String field, String text) {
try (TokenStream source = analyzer.tokenStream(field, part)) { try (TokenStream source = analyzer.tokenStream(field, text)) {
source.reset(); source.reset();
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
@ -242,22 +250,54 @@ public class ICUCollationField extends FieldType {
// we control the analyzer here: most errors are impossible // we control the analyzer here: most errors are impossible
if (!source.incrementToken()) if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part); throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
termAtt.fillBytesRef(); termAtt.fillBytesRef();
assert !source.incrementToken(); assert !source.incrementToken();
source.end(); source.end();
return BytesRef.deepCopyOf(bytes); return BytesRef.deepCopyOf(bytes);
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException("Unable analyze range part: " + part, e); throw new RuntimeException("Unable to analyze text: " + text, e);
} }
} }
@Override @Override
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) { public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
String f = field.getName(); String f = field.getName();
BytesRef low = part1 == null ? null : analyzeRangePart(f, part1); BytesRef low = part1 == null ? null : getCollationKey(f, part1);
BytesRef high = part2 == null ? null : analyzeRangePart(f, part2); BytesRef high = part2 == null ? null : getCollationKey(f, part2);
return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive); if (!field.indexed() && field.hasDocValues()) {
if (field.multiValued()) {
return new ConstantScoreQuery(DocTermOrdsRangeFilter.newBytesRefRange(
field.getName(), low, high, minInclusive, maxInclusive));
} else {
return new ConstantScoreQuery(FieldCacheRangeFilter.newBytesRefRange(
field.getName(), low, high, minInclusive, maxInclusive));
}
} else {
return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
}
}
@Override
public void checkSchemaField(SchemaField field) {
// no-op
}
@Override
public List<StorableField> createFields(SchemaField field, Object value, float boost) {
if (field.hasDocValues()) {
List<StorableField> fields = new ArrayList<StorableField>();
fields.add(createField(field, value, boost));
final BytesRef bytes = getCollationKey(field.getName(), value.toString());
if (field.multiValued()) {
fields.add(new SortedSetDocValuesField(field.getName(), bytes));
} else {
fields.add(new SortedDocValuesField(field.getName(), bytes));
}
return fields;
} else {
return Collections.singletonList(createField(field, value, boost));
}
} }
} }

View File

@ -0,0 +1,59 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for CollationField (docvalues) -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="sort_ar_t" class="solr.ICUCollationField" locale="ar"/>
<fieldtype name="sort_de_t" class="solr.ICUCollationField" locale="de" strength="primary"/>
<fieldtype name="sort_tr_canon_t" class="solr.ICUCollationField" locale="tr" strength="primary" decomposition="canonical"/>
<fieldtype name="sort_da_t" class="solr.ICUCollationField" locale="da" strength="primary"/>
<fieldtype name="sort_custom_t" class="solr.ICUCollationField" custom="customrules.dat" strength="primary"/>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="sort_ar" type="sort_ar_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
<field name="sort_de" type="sort_de_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
<field name="sort_tr_canon" type="sort_tr_canon_t" indexed="false" stored="false" multiValued="true" docValues="true"/>
<field name="sort_da" type="sort_da_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
<field name="sort_custom" type="sort_custom_t" indexed="false" stored="false" multiValued="true" docValues="true"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
<!-- copy our text to some sort fields with different orders -->
<copyField source="text" dest="sort_ar"/>
<copyField source="text" dest="sort_de"/>
<copyField source="text" dest="sort_tr_canon"/>
<copyField source="text" dest="sort_da"/>
<copyField source="text" dest="sort_custom"/>
</schema>

View File

@ -0,0 +1,186 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import java.io.File;
import java.io.FileOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
/**
* Tests {@link ICUCollationField} with docValues.
*/
@SuppressCodecs({"Lucene40", "Lucene41"})
public class TestICUCollationFieldDocValues extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
String home = setupSolrHome();
initCore("solrconfig.xml","schema.xml", home);
// add some docs
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
assertU(adoc("id", "4", "text", "Töne"));
assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
assertU(adoc("id", "6", "text", ""));
assertU(adoc("id", "7", "text", "Tone"));
assertU(adoc("id", "8", "text", "Testing"));
assertU(adoc("id", "9", "text", "testing"));
assertU(adoc("id", "10", "text", "toene"));
assertU(adoc("id", "11", "text", "Tzne"));
assertU(adoc("id", "12", "text", "\u0698\u0698"));
assertU(commit());
}
/**
* Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
* These are largish files, and jvm-specific (as our documentation says, you should always
* look out for jvm differences with collation).
* So its preferable to create this file on-the-fly.
*/
public static String setupSolrHome() throws Exception {
// make a solr home underneath the test's TEMP_DIR
File tmpFile = File.createTempFile("test", "tmp", TEMP_DIR);
tmpFile.delete();
tmpFile.mkdir();
// make data and conf dirs
new File(tmpFile + "/collection1", "data").mkdirs();
File confDir = new File(tmpFile + "/collection1", "conf");
confDir.mkdirs();
// copy over configuration files
FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml"), new File(confDir, "schema.xml"));
// generate custom collation rules (DIN 5007-2), saving to customrules.dat
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"));
IOUtils.write(tailoredRules, os, "UTF-8");
os.close();
return tmpFile.getAbsolutePath();
}
/**
* Test termquery with german DIN 5007-1 primary strength.
* In this case, ö is equivalent to o (but not oe)
*/
public void testBasicTermQuery() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test rangequery again with the DIN 5007-1 collator.
* We do a range query of tone .. tp, in binary order this
* would retrieve nothing due to case and accent differences.
*/
public void testBasicRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test sort with a danish collator. ö is ordered after z
*/
public void testBasicSort() {
assertQ("Collated Sort: ",
req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=11]",
"//result/doc[2]/int[@name='id'][.=4]"
);
}
/**
* Test sort with an arabic collator. U+0633 is ordered after U+0698.
* With a binary collator, the range would also return nothing.
*/
public void testArabicSort() {
assertQ("Collated Sort: ",
req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=12]",
"//result/doc[2]/int[@name='id'][.=1]"
);
}
/**
* Test rangequery again with an Arabic collator.
* Binary order would normally order U+0633 in this range.
*/
public void testNegativeRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
"//*[@numFound='0']"
);
}
/**
* Test canonical decomposition with turkish primary strength.
* With this sort order, İ is the uppercase form of i, and I is the uppercase form of ı.
* We index a decomposed form of İ.
*/
public void testCanonicalDecomposition() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"", "sort", "id asc" ),
"//*[@numFound='3']",
"//result/doc[1]/int[@name='id'][.=2]",
"//result/doc[2]/int[@name='id'][.=3]",
"//result/doc[3]/int[@name='id'][.=5]"
);
}
/**
* Test termquery with custom collator (DIN 5007-2).
* In this case, ö is equivalent to oe (but not o)
*/
public void testCustomCollation() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_custom:toene", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=10]"
);
}
}

View File

@ -22,6 +22,9 @@ import java.io.InputStream;
import java.text.Collator; import java.text.Collator;
import java.text.ParseException; import java.text.ParseException;
import java.text.RuleBasedCollator; import java.text.RuleBasedCollator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
@ -30,7 +33,12 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.collation.CollationKeyAnalyzer; import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.StorableField; import org.apache.lucene.index.StorableField;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocTermOrdsRangeFilter;
import org.apache.lucene.search.FieldCacheRangeFilter;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TermRangeQuery;
@ -209,30 +217,62 @@ public class CollationField extends FieldType {
* its just that all methods are synced), this keeps things * its just that all methods are synced), this keeps things
* simple (we already have a threadlocal clone in the reused TS) * simple (we already have a threadlocal clone in the reused TS)
*/ */
private BytesRef analyzeRangePart(String field, String part) { private BytesRef getCollationKey(String field, String text) {
try (TokenStream source = analyzer.tokenStream(field, part)) { try (TokenStream source = analyzer.tokenStream(field, text)) {
source.reset(); source.reset();
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef(); BytesRef bytes = termAtt.getBytesRef();
// we control the analyzer here: most errors are impossible // we control the analyzer here: most errors are impossible
if (!source.incrementToken()) if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part); throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
termAtt.fillBytesRef(); termAtt.fillBytesRef();
assert !source.incrementToken(); assert !source.incrementToken();
source.end(); source.end();
return BytesRef.deepCopyOf(bytes); return BytesRef.deepCopyOf(bytes);
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException("Unable to analyze range part: " + part, e); throw new RuntimeException("Unable to analyze text: " + text, e);
} }
} }
@Override @Override
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) { public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
String f = field.getName(); String f = field.getName();
BytesRef low = part1 == null ? null : analyzeRangePart(f, part1); BytesRef low = part1 == null ? null : getCollationKey(f, part1);
BytesRef high = part2 == null ? null : analyzeRangePart(f, part2); BytesRef high = part2 == null ? null : getCollationKey(f, part2);
return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive); if (!field.indexed() && field.hasDocValues()) {
if (field.multiValued()) {
return new ConstantScoreQuery(DocTermOrdsRangeFilter.newBytesRefRange(
field.getName(), low, high, minInclusive, maxInclusive));
} else {
return new ConstantScoreQuery(FieldCacheRangeFilter.newBytesRefRange(
field.getName(), low, high, minInclusive, maxInclusive));
}
} else {
return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
}
}
@Override
public void checkSchemaField(SchemaField field) {
// no-op
}
@Override
public List<StorableField> createFields(SchemaField field, Object value, float boost) {
if (field.hasDocValues()) {
List<StorableField> fields = new ArrayList<StorableField>();
fields.add(createField(field, value, boost));
final BytesRef bytes = getCollationKey(field.getName(), value.toString());
if (field.multiValued()) {
fields.add(new SortedSetDocValuesField(field.getName(), bytes));
} else {
fields.add(new SortedDocValuesField(field.getName(), bytes));
}
return fields;
} else {
return Collections.singletonList(createField(field, value, boost));
}
} }
} }

View File

@ -0,0 +1,62 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for CollationField -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="sort_ar_t" class="solr.CollationField" language="ar"/>
<fieldtype name="sort_de_t" class="solr.CollationField" language="de" strength="primary"/>
<fieldtype name="sort_tr_canon_t" class="solr.CollationField" language="tr" strength="primary" decomposition="canonical"/>
<fieldtype name="sort_zh_full_t" class="solr.CollationField" language="zh" strength="identical" decomposition="full"/>
<fieldtype name="sort_da_t" class="solr.CollationField" language="da" strength="primary"/>
<fieldtype name="sort_custom_t" class="solr.CollationField" custom="customrules.dat" strength="primary"/>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="sort_ar" type="sort_ar_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
<field name="sort_de" type="sort_de_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
<field name="sort_tr_canon" type="sort_tr_canon_t" indexed="false" stored="false" multiValued="true" docValues="true"/>
<field name="sort_zh_full" type="sort_zh_full_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
<field name="sort_da" type="sort_da_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
<field name="sort_custom" type="sort_custom_t" indexed="false" stored="false" multiValued="true" docValues="true"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
<!-- copy our text to some sort fields with different orders -->
<copyField source="text" dest="sort_ar"/>
<copyField source="text" dest="sort_de"/>
<copyField source="text" dest="sort_tr_canon"/>
<copyField source="text" dest="sort_zh_full"/>
<copyField source="text" dest="sort_da"/>
<copyField source="text" dest="sort_custom"/>
</schema>

View File

@ -0,0 +1,201 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import java.io.File;
import java.io.FileOutputStream;
import java.text.Collator;
import java.text.RuleBasedCollator;
import java.util.Locale;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
/**
* Tests {@link CollationField} with docvalues
*/
@SuppressCodecs({"Lucene40", "Lucene41"})
public class TestCollationFieldDocValues extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
String home = setupSolrHome();
initCore("solrconfig.xml","schema.xml", home);
// add some docs
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
assertU(adoc("id", "4", "text", "Töne"));
assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
assertU(adoc("id", "6", "text", ""));
assertU(adoc("id", "7", "text", "Tone"));
assertU(adoc("id", "8", "text", "Testing"));
assertU(adoc("id", "9", "text", "testing"));
assertU(adoc("id", "10", "text", "toene"));
assertU(adoc("id", "11", "text", "Tzne"));
assertU(adoc("id", "12", "text", "\u0698\u0698"));
assertU(commit());
}
/**
* Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
* These are largish files, and jvm-specific (as our documentation says, you should always
* look out for jvm differences with collation).
* So its preferable to create this file on-the-fly.
*/
public static String setupSolrHome() throws Exception {
// make a solr home underneath the test's TEMP_DIR
File tmpFile = _TestUtil.getTempDir("collation1");
tmpFile.delete();
tmpFile.mkdir();
// make data and conf dirs
new File(tmpFile, "data").mkdir();
File confDir = new File(tmpFile + "/collection1", "conf");
confDir.mkdirs();
// copy over configuration files
FileUtils.copyFile(getFile("solr/collection1/conf/solrconfig-basic.xml"), new File(confDir, "solrconfig.xml"));
FileUtils.copyFile(getFile("solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml"), new File(confDir, "solrconfig.snippet.randomindexconfig.xml"));
FileUtils.copyFile(getFile("solr/collection1/conf/schema-collate-dv.xml"), new File(confDir, "schema.xml"));
// generate custom collation rules (DIN 5007-2), saving to customrules.dat
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new Locale("de", "DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"));
IOUtils.write(tailoredRules, os, "UTF-8");
os.close();
return tmpFile.getAbsolutePath();
}
/**
* Test termquery with german DIN 5007-1 primary strength.
* In this case, ö is equivalent to o (but not oe)
*/
public void testBasicTermQuery() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test rangequery again with the DIN 5007-1 collator.
* We do a range query of tone .. tp, in binary order this
* would retrieve nothing due to case and accent differences.
*/
public void testBasicRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=7]"
);
}
/**
* Test sort with a danish collator. ö is ordered after z
*/
public void testBasicSort() {
assertQ("Collated Sort: ",
req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=11]",
"//result/doc[2]/int[@name='id'][.=4]"
);
}
/**
* Test sort with an arabic collator. U+0633 is ordered after U+0698.
* With a binary collator, the range would also return nothing.
*/
public void testArabicSort() {
assertQ("Collated Sort: ",
req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=12]",
"//result/doc[2]/int[@name='id'][.=1]"
);
}
/**
* Test rangequery again with an Arabic collator.
* Binary order would normally order U+0633 in this range.
*/
public void testNegativeRangeQuery() {
assertQ("Collated RangeQ: ",
req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
"//*[@numFound='0']"
);
}
/**
* Test canonical decomposition with turkish primary strength.
* With this sort order, İ is the uppercase form of i, and I is the uppercase form of ı.
* We index a decomposed form of İ.
*/
public void testCanonicalDecomposition() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"", "sort", "id asc" ),
"//*[@numFound='3']",
"//result/doc[1]/int[@name='id'][.=2]",
"//result/doc[2]/int[@name='id'][.=3]",
"//result/doc[3]/int[@name='id'][.=5]"
);
}
/**
* Test full decomposition with chinese identical strength.
* The full width form "" is treated identical to "Testing"
*/
public void testFullDecomposition() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_zh_full:Testing", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=6]",
"//result/doc[2]/int[@name='id'][.=8]"
);
}
/**
* Test termquery with custom collator (DIN 5007-2).
* In this case, ö is equivalent to oe (but not o)
*/
public void testCustomCollation() {
assertQ("Collated TQ: ",
req("fl", "id", "q", "sort_custom:toene", "sort", "id asc" ),
"//*[@numFound='2']",
"//result/doc[1]/int[@name='id'][.=4]",
"//result/doc[2]/int[@name='id'][.=10]"
);
}
}