LUCENE-2124: move jdk collation to core, icu collation to icu contrib

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@888780 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-12-09 12:08:06 +00:00
parent 3df3ab8e9f
commit 550a4ef1af
20 changed files with 209 additions and 91 deletions

View File

@ -83,6 +83,10 @@ Optimizations
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
into core, and moved the ICU-based collation support into contrib/icu.
(Robert Muir)
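  For reference, a minimal sketch of what the move means for callers (hypothetical
  snippet, not part of this patch; the class name CollationSetup is made up):
  the JDK-based classes now resolve from lucene-core alone, while the ICU-based
  classes live in contrib/icu and additionally require the bundled icu4j jar.

    import java.text.Collator;
    import java.util.Locale;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.collation.CollationKeyAnalyzer;     // now shipped in lucene-core
    // org.apache.lucene.collation.ICUCollationKeyAnalyzer       // now shipped in contrib/icu (needs icu4j-*.jar)

    public class CollationSetup {
      // Builds the JDK-backed analyzer; only lucene-core is needed on the classpath.
      public static Analyzer jdkCollationAnalyzer() {
        Collator collator = Collator.getInstance(new Locale("da", "DK"));
        return new CollationKeyAnalyzer(collator);
      }
    }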
Test Cases
* LUCENE-2037 Allow Junit4 tests in our environment (Erick Erickson

View File

@ -30,6 +30,6 @@ Includes lib/servlet-api-2.4.jar from Apache Tomcat
The SmartChineseAnalyzer source code (under contrib/analyzers) was
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
ICU4J (under contrib/collation) is licensed under an MIT-style license
(contrib/collation/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008
ICU4J (under contrib/icu) is licensed under an MIT-style license
(contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008
International Business Machines Corporation and others

View File

@ -300,7 +300,7 @@
<packageset dir="contrib/analyzers/smartcn/src/java"/>
<packageset dir="contrib/ant/src/java"/>
<packageset dir="contrib/benchmark/src/java"/>
<packageset dir="contrib/collation/src/java"/>
<packageset dir="contrib/icu/src/java"/>
<packageset dir="contrib/db/bdb-je/src/java"/>
<packageset dir="contrib/db/bdb/src/java"/>
<packageset dir="contrib/fast-vector-highlighter/src/java"/>
@ -334,7 +334,7 @@
<group title="contrib: Analysis" packages="org.apache.lucene.analysis.*"/>
<group title="contrib: Ant" packages="org.apache.lucene.ant*"/>
<group title="contrib: Benchmark" packages="org.apache.lucene.benchmark*"/>
<group title="contrib: Collation" packages="org.apache.lucene.collation*"/>
<group title="contrib: ICU" packages="org.apache.lucene.collation*"/>
<group title="contrib: DB" packages="org.apache.lucene.store.db*:org.apache.lucene.store.je*:com.sleepycat*"/>
<group title="contrib: Fast Vector Highlighter" packages="org.apache.lucene.search.vectorhighlight*"/>
<group title="contrib: Highlighter" packages="org.apache.lucene.search.highlight*"/>

View File

@ -37,6 +37,12 @@ New features
* LUCENE-2062: Add a Bulgarian analyzer. (Robert Muir, Simon Willnauer)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
into core, and moved the ICU-based collation support into contrib/icu.
(Robert Muir)
Test Cases
* LUCENE-2115: Cutover contrib tests to use Java5 generics. (Kay Kay

View File

@ -17,17 +17,16 @@
limitations under the License.
-->
<project name="collation" default="default">
<project name="icu" default="default">
<description>
CollationKeyFilter, ICUCollationKeyFilter, CollationKeyAnalyzer, and
ICUCollationKeyAnalyzer - converts tokens into indexable collation keys
Provides integration with ICU (International Components for Unicode) for
stronger Unicode and internationalization support.
</description>
<path id="additional.dependencies">
<fileset dir="lib" includes="icu4j-*.jar"/>
<pathelement location="../../build/contrib/misc/classes/java"/>
</path>
<pathconvert property="project.classpath"
@ -37,16 +36,4 @@
<import file="../contrib-build.xml"/>
<target name="compile-misc">
<subant target="compile">
<fileset dir="${common.dir}/contrib/misc" includes="build.xml"/>
</subant>
</target>
<target name="init" depends="common.init,compile-misc"/>
<target name="compile" depends="init">
<antcall target="common.compile" inheritRefs="true" />
</target>
</project>

View File

@ -27,14 +27,14 @@
<version>@version@</version>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-collation</artifactId>
<artifactId>lucene-icu</artifactId>
<name>
Lucene CollationKeyFilter/Analyzer &amp; ICUCollationKeyFilter/Analyzer
Lucene ICUCollationKeyFilter/Analyzer
</name>
<version>@version@</version>
<description>
CollationKeyFilter, ICUCollationKeyFilter, CollationKeyAnalyzer, and
ICUCollationKeyAnalyzer - converts tokens into indexable collation keys
Provides integration with ICU (International Components for Unicode) for
stronger Unicode and internationalization support.
</description>
<packaging>jar</packaging>
<dependencies>

View File

@ -21,8 +21,8 @@
</head>
<body>
<p>
<code>CollationKeyFilter</code> and <code>ICUCollationKeyFilter</code>
convert each token into its binary <code>CollationKey</code> using the
<code>ICUCollationKeyFilter</code>
converts each token into its binary <code>CollationKey</code> using the
provided <code>Collator</code>, and then encodes the <code>CollationKey</code>
as a String using
{@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
@ -60,9 +60,8 @@
<h3>Farsi Range Queries</h3>
<code><pre>
// "fa" Locale is not supported by Sun JDK 1.4 or 1.5
Collator collator = Collator.getInstance(new Locale("ar"));
CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
@ -92,7 +91,7 @@
<h3>Danish Sorting</h3>
<code><pre>
Analyzer analyzer
= new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
= new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
@ -121,7 +120,7 @@
<code><pre>
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
Analyzer analyzer = new CollationKeyAnalyzer(collator);
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

View File

@ -17,8 +17,7 @@
<html>
<head>
<title>
Apache Lucene CollationKeyFilter/Analyzer and
ICUCollationKeyFilter/Analyzer
Apache Lucene ICUCollationKeyFilter/Analyzer
</title>
</head>
<body>

View File

@ -37,11 +37,6 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
private String secondRangeEnd = encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
public void testFarsiQueryParserCollating() throws Exception {
testFarsiQueryParserCollating(analyzer);
}
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,

View File

@ -57,11 +57,6 @@ public class TestICUCollationKeyFilter extends CollationTestBase {
}
}
public void testFarsiQueryParserCollating() throws Exception {
testFarsiQueryParserCollating(analyzer);
}
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);

View File

@ -59,7 +59,8 @@ import java.io.IOException;
* </li>
* </ol>
* <p>
* {@link ICUCollationKeyAnalyzer} uses ICU4J's Collator, which makes
* The <code>ICUCollationKeyAnalyzer</code> in the icu package of Lucene's
* contrib area uses ICU4J's Collator, which makes
* its version available, thus allowing collation to be versioned
* independently from the JVM. ICUCollationKeyAnalyzer is also significantly
* faster and generates significantly shorter keys than CollationKeyAnalyzer.

View File

@ -20,7 +20,6 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;
@ -58,7 +57,8 @@ import java.text.Collator;
* </li>
* </ol>
* <p>
* {@link ICUCollationKeyFilter} uses ICU4J's Collator, which makes its
* The <code>ICUCollationKeyFilter</code> in the icu package of Lucene's
* contrib area uses ICU4J's Collator, which makes its
* version available, thus allowing collation to be versioned independently
* from the JVM. ICUCollationKeyFilter is also significantly faster and
* generates significantly shorter keys than CollationKeyFilter. See
@ -71,7 +71,7 @@ import java.text.Collator;
* CollationKeys generated by java.text.Collators are not compatible
* with those generated by ICU Collators. Specifically, if you use
* CollationKeyFilter to generate index terms, do not use
* {@link ICUCollationKeyFilter} on the query side, or vice versa.
* ICUCollationKeyFilter on the query side, or vice versa.
* </p>
*/
public final class CollationKeyFilter extends TokenFilter {

View File

@ -0,0 +1,176 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<title>Lucene Collation Package</title>
</head>
<body>
<p>
<code>CollationKeyFilter</code>
converts each token into its binary <code>CollationKey</code> using the
provided <code>Collator</code>, and then encodes the <code>CollationKey</code>
as a String using
{@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
stored as an index term.
</p>
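<p>
Conceptually, the per-token transformation looks roughly like the sketch below.
It is illustrative only: it assumes the buffer-based
<code>IndexableBinaryStringTools</code> API used elsewhere in this change, and
<code>termText</code> stands in for the current token's text.
</p>
<code><pre>
  // Illustrative sketch of what CollationKeyFilter does for each token.
  byte[] collationKeyBytes = collator.getCollationKey(termText).toByteArray();
  ByteBuffer keyBuf = ByteBuffer.wrap(collationKeyBytes);
  char[] encoded = new char[IndexableBinaryStringTools.getEncodedLength(keyBuf)];
  CharBuffer encodedBuf = CharBuffer.wrap(encoded);
  IndexableBinaryStringTools.encode(keyBuf, encodedBuf);
  String indexableKey = new String(encoded); // this String becomes the index term
</pre></code>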
<h2>Use Cases</h2>
<ul>
<li>
Efficient sorting of terms in languages that use non-Unicode character
orderings. (Lucene Sort using a Locale can be very slow.)
</li>
<li>
Efficient range queries over fields that contain terms in languages that
use non-Unicode character orderings. (Range queries using a Locale can be
very slow.)
</li>
<li>
Effective Locale-specific normalization (case differences, diacritics, etc.).
({@link org.apache.lucene.analysis.LowerCaseFilter} and
{@link org.apache.lucene.analysis.ASCIIFoldingFilter} provide these services
in a generic way that doesn't take into account locale-specific needs.)
</li>
</ul>
<h2>Example Usages</h2>
<h3>Farsi Range Queries</h3>
<code><pre>
// "fa" Locale is not supported by Sun JDK 1.4 or 1.5
Collator collator = Collator.getInstance(new Locale("ar"));
CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
// The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
// to be passed through an analyzer - Lucene's standard QueryParser does not
// allow this.
AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
aqp.setLowercaseExpandedTerms(false);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
// with a Farsi Collator (or an Arabic one for the case when Farsi is not
// supported).
ScoreDoc[] result
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
</pre></code>
<h3>Danish Sorting</h3>
<code><pre>
Analyzer analyzer
= new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
for (int i = 0 ; i < data.length ; ++i) {
Document doc = new Document();
doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
}
writer.close();
Searcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
for (int i = 0 ; i < result.length ; ++i) {
Document doc = searcher.doc(result[i].doc);
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
}
</pre></code>
<h3>Turkish Case Normalization</h3>
<code><pre>
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
Analyzer analyzer = new CollationKeyAnalyzer(collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
QueryParser parser = new QueryParser("contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
</pre></code>
<h2>Caveats and Comparisons</h2>
<p>
<strong>WARNING:</strong> Make sure you use exactly the same
<code>Collator</code> at index and query time -- <code>CollationKey</code>s
are only comparable when produced by
the same <code>Collator</code>. Since {@link java.text.RuleBasedCollator}s
are not independently versioned, it is unsafe to search against stored
<code>CollationKey</code>s unless the following are exactly the same (best
practice is to store this information with the index and check that it
remains the same at query time, as sketched after this list):
</p>
<ol>
<li>JVM vendor</li>
<li>JVM version, including patch version</li>
<li>
The language (and country and variant, if specified) of the Locale
used when constructing the collator via
{@link java.text.Collator#getInstance(java.util.Locale)}.
</li>
<li>
The collation strength used - see {@link java.text.Collator#setStrength(int)}
</li>
</ol>
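<p>
For example, a small helper along these lines could record the environment at
index time so it can be compared at query time (hypothetical code, not part of
Lucene; how you persist the resulting string, whether as commit user data, a
side file, or a dedicated stored field, is up to the application):
</p>
<code><pre>
  // Hypothetical helper: summarizes the collation environment as a single string.
  static String collationSignature(Locale locale, Collator collator) {
    return System.getProperty("java.vendor") + "/"
         + System.getProperty("java.version") + "/"
         + locale.toString() + "/"
         + collator.getStrength();
  }

  // At index time: store collationSignature(...) alongside the index.
  // At query time: recompute it and refuse to search if the two values differ.
</pre></code>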
<p>
<code>ICUCollationKeyFilter</code>, available in the icu package in Lucene's contrib area,
uses ICU4J's <code>Collator</code>, which
makes its version available, thus allowing collation to be versioned
independently from the JVM. <code>ICUCollationKeyFilter</code> is also
significantly faster and generates significantly shorter keys than
<code>CollationKeyFilter</code>. See
<a href="http://site.icu-project.org/charts/collation-icu4j-sun"
>http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
generation timing and key length comparisons between ICU4J and
<code>java.text.Collator</code> over several languages.
</p>
<p>
<code>CollationKey</code>s generated by <code>java.text.Collator</code>s are
not compatible with those generated by ICU Collators. Specifically, if
you use <code>CollationKeyFilter</code> to generate index terms, do not use
<code>ICUCollationKeyFilter</code> on the query side, or vice versa.
</p>
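<p>
In other words, whichever filter family you pick, build the index-time and
query-time analyzers from it consistently. A minimal sketch using the JDK-based
analyzer on both sides (illustrative only, mirroring the examples above):
</p>
<code><pre>
  Collator collator = Collator.getInstance(new Locale("tr", "TR"));
  Analyzer analyzer = new CollationKeyAnalyzer(collator);
  RAMDirectory ramDir = new RAMDirectory();

  // Index time and query time share the exact same analyzer configuration.
  IndexWriter writer = new IndexWriter
      (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
  // ... add documents, then writer.close() ...
  QueryParser parser = new QueryParser("contents", analyzer);
</pre></code>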
</body>
</html>

View File

@ -38,7 +38,6 @@ import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.queryParser.analyzing.AnalyzingQueryParser;
import org.apache.lucene.util.Version;
import java.io.IOException;
@ -71,40 +70,7 @@ public class CollationTestBase extends TestCase {
IndexableBinaryStringTools.encode(begBuf, encodedBegBuf);
return new String(encodedBegArray);
}
public void testFarsiQueryParserCollating(Analyzer analyzer) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
AnalyzingQueryParser aqp = new AnalyzingQueryParser(Version.LUCENE_CURRENT, "content", analyzer);
aqp.setLowercaseExpandedTerms(false);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// index Term below should NOT be returned by a TermRangeQuery
// with a Farsi Collator (or an Arabic one for the case when Farsi is not
// supported).
// Test TermRangeQuery
ScoreDoc[] result
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
result = is.search(aqp.parse("[ \u0633 TO \u0638 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
is.close();
}
public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
String firstEnd, String secondBeg,
String secondEnd) throws Exception {

View File

@ -41,11 +41,6 @@ public class TestCollationKeyAnalyzer extends CollationTestBase {
private String secondRangeEnd = encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
public void testFarsiQueryParserCollating() throws Exception {
testFarsiQueryParserCollating(analyzer);
}
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,

View File

@ -60,11 +60,6 @@ public class TestCollationKeyFilter extends CollationTestBase {
}
}
public void testFarsiQueryParserCollating() throws Exception {
testFarsiQueryParserCollating(analyzer);
}
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,