LUCENE-2943: fix thread-safety issues with ICU collation

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1075850 13f79535-47bb-0310-9956-ffa450edef68
2011-03-01 15:47:14 +00:00 · 2011-03-01 15:47:14 +00:00 · 2509d35c11
parent e71e682352
commit 2509d35c11
8 changed files with 119 additions and 4 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -148,6 +148,9 @@ Bug fixes

 * LUCENE-2874: Highlighting overlapping tokens outputted doubled words.
   (Pierre Gossé via Robert Muir)
+
+ * LUCENE-2943: Fix thread-safety issues with ICUCollationKeyFilter. 
+   (Robert Muir)
   
 API Changes

--- a/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java
@ -85,7 +85,9 @@ public final class CollationKeyFilter extends TokenFilter {
   */
  public CollationKeyFilter(TokenStream input, Collator collator) {
    super(input);
-    this.collator = collator;
+    // clone in case JRE doesnt properly sync,
+    // or to reduce contention in case they do
+    this.collator = (Collator) collator.clone();
  }

  @Override
--- a/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java
@ -34,7 +34,9 @@ public class CollatedTermAttributeImpl extends CharTermAttributeImpl {
   * @param collator Collation key generator
   */
  public CollatedTermAttributeImpl(Collator collator) {
-    this.collator = collator;
+    // clone in case JRE doesnt properly sync,
+    // or to reduce contention in case they do
+    this.collator = (Collator) collator.clone();
  }
  
  @Override
--- a/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
@ -21,6 +21,8 @@ package org.apache.lucene.collation;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
@ -39,9 +41,12 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IndexableBinaryStringTools;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;

 import java.io.StringReader;
 import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;

 public abstract class CollationTestBase extends LuceneTestCase {

@ -252,4 +257,77 @@ public abstract class CollationTestBase extends LuceneTestCase {
    }
    assertEquals(expectedResult, buff.toString());
  }
+  
+  private String randomString() {
+    // ideally we could do this!
+    // return _TestUtil.randomUnicodeString(random);
+    //
+    // http://bugs.icu-project.org/trac/ticket/8060
+    // http://bugs.icu-project.org/trac/ticket/7732
+    // ...
+    // 
+    // as a workaround, just test the BMP for now (and avoid 0xFFFF etc)
+    int length = _TestUtil.nextInt(random, 0, 10);
+    char chars[] = new char[length];
+    for (int i = 0; i < length; i++) {
+      if (random.nextBoolean()) {
+        chars[i] = (char) _TestUtil.nextInt(random, 0, 0xD7FF);
+      } else {
+        chars[i] = (char) _TestUtil.nextInt(random, 0xE000, 0xFFFD);
+      }
+    }
+    return new String(chars, 0, length);
+  }
+
+  public void assertThreadSafe(final Analyzer analyzer) throws Exception {
+    int numTestPoints = 1000 * RANDOM_MULTIPLIER;
+    int numThreads = _TestUtil.nextInt(random, 4, 8);
+    final HashMap<String,BytesRef> map = new HashMap<String,BytesRef>();
+    BytesRef spare = new BytesRef();
+    
+    // create a map<String,SortKey> up front.
+    // then with multiple threads, generate sort keys for all the keys in the map
+    // and ensure they are the same as the ones we produced in serial fashion.
+
+    for (int i = 0; i < numTestPoints; i++) {
+      String term = randomString();
+      TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+      TermToBytesRefAttribute bytes = ts.addAttribute(TermToBytesRefAttribute.class);
+      ts.reset();
+      assertTrue(ts.incrementToken());
+      bytes.toBytesRef(spare);
+      // ensure we make a copy of the actual bytes too
+      map.put(term, new BytesRef(spare));
+    }
+    
+    Thread threads[] = new Thread[numThreads];
+    for (int i = 0; i < numThreads; i++) {
+      threads[i] = new Thread() {
+        @Override
+        public void run() {
+          try {
+            BytesRef spare = new BytesRef();
+            for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
+              String term = mapping.getKey();
+              BytesRef expected = mapping.getValue();
+              TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+              TermToBytesRefAttribute bytes = ts.addAttribute(TermToBytesRefAttribute.class);
+              ts.reset();
+              assertTrue(ts.incrementToken());
+              bytes.toBytesRef(spare);
+              assertEquals(expected, spare);
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+      };
+    }
+    for (int i = 0; i < numThreads; i++) {
+      threads[i].start();
+    }
+    for (int i = 0; i < numThreads; i++) {
+      threads[i].join();
+    }
+  }
 }
--- a/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
@ -83,4 +83,14 @@ public class TestCollationKeyAnalyzer extends CollationTestBase {
    (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, 
     oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
  }
+  
+  public void testThreadSafe() throws Exception {
+    int iters = 20 * RANDOM_MULTIPLIER;
+    for (int i = 0; i < iters; i++) {
+      Locale locale = randomLocale(random);
+      Collator collator = Collator.getInstance(locale);
+      collator.setStrength(Collator.PRIMARY);
+      assertThreadSafe(new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+    }
+  }
 }
--- a/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
+++ b/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
@ -84,7 +84,12 @@ public final class ICUCollationKeyFilter extends TokenFilter {
   */
  public ICUCollationKeyFilter(TokenStream input, Collator collator) {
    super(input);
-    this.collator = collator;
+    // clone the collator: see http://userguide.icu-project.org/collation/architecture
+    try {
+      this.collator = (Collator) collator.clone();
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
  }

  @Override
--- a/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java
+++ b/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java
@ -36,7 +36,12 @@ public class ICUCollatedTermAttributeImpl extends CharTermAttributeImpl {
   * @param collator Collation key generator
   */
  public ICUCollatedTermAttributeImpl(Collator collator) {
-    this.collator = collator;
+    // clone the collator: see http://userguide.icu-project.org/collation/architecture
+    try {
+      this.collator = (Collator) collator.clone();
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
  }
  
  @Override
--- a/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
+++ b/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
@ -84,4 +84,14 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
    (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, 
     "BFJHD", "ECAGI", "BJDFH", "BJDHF");
  }
+  
+  public void testThreadSafe() throws Exception {
+    int iters = 20 * RANDOM_MULTIPLIER;
+    for (int i = 0; i < iters; i++) {
+      Locale locale = randomLocale(random);
+      Collator collator = Collator.getInstance(locale);
+      collator.setStrength(Collator.IDENTICAL);
+      assertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+    }
+  }
 }